From 2968c846cef97ae720d55ec669bb440411dcfee6 Mon Sep 17 00:00:00 2001
From: Matej Marinko <matej.marinko@soniox.com>
Date: Wed, 28 May 2025 09:35:21 +0200
Subject: [PATCH 01/18] Add Soniox STT service

---
 README.md                                     |   2 +-
 docs/api/conf.py                              |   1 +
 docs/api/requirements.txt                     |   1 +
 dot-env.template                              |   5 +-
 .../foundational/07za-interruptible-soniox.py | 115 ++++++
 .../foundational/13f-soniox-transcription.py  |  77 ++++
 pyproject.toml                                |   1 +
 src/pipecat/services/soniox/__init__.py       |  13 +
 src/pipecat/services/soniox/config.py         |  36 ++
 src/pipecat/services/soniox/stt.py            | 368 ++++++++++++++++++
 10 files changed, 617 insertions(+), 2 deletions(-)
 create mode 100644 examples/foundational/07za-interruptible-soniox.py
 create mode 100644 examples/foundational/13f-soniox-transcription.py
 create mode 100644 src/pipecat/services/soniox/__init__.py
 create mode 100644 src/pipecat/services/soniox/config.py
 create mode 100644 src/pipecat/services/soniox/stt.py

diff --git a/README.md b/README.md
index 3966e8d65..6833cf7c5 100644
--- a/README.md
+++ b/README.md
@@ -53,7 +53,7 @@ You can connect to Pipecat from any platform using our official SDKs:
 
 | Category            | Services                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
 | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| Speech-to-Text      | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [Parakeet (NVIDIA)](https://docs.pipecat.ai/server/services/stt/parakeet), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper)                                                                                                                                                                                                                                                                                            |
+| Speech-to-Text      | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [Parakeet (NVIDIA)](https://docs.pipecat.ai/server/services/stt/parakeet), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Ultravox](https://docs.pipecat.ai/server/services/stt/ultravox), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper)                                                                                                                                                                                                                              |
 | LLMs                | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nim), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [Together AI](https://docs.pipecat.ai/server/services/llm/together)                                                 |
 | Text-to-Speech      | [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [FastPitch (NVIDIA)](https://docs.pipecat.ai/server/services/tts/fastpitch), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [PlayHT](https://docs.pipecat.ai/server/services/tts/playht), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) |
 | Speech-to-Speech    | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
diff --git a/docs/api/conf.py b/docs/api/conf.py
index a33caa10c..8de9108f8 100644
--- a/docs/api/conf.py
+++ b/docs/api/conf.py
@@ -75,6 +75,7 @@ autodoc_mock_imports = [
     "openpipe",
     "simli",
     "soundfile",
+    "soniox",
     "pipecat_ai_krisp",
     "pyaudio",
     "_tkinter",
diff --git a/docs/api/requirements.txt b/docs/api/requirements.txt
index a77ff1084..73cda37cc 100644
--- a/docs/api/requirements.txt
+++ b/docs/api/requirements.txt
@@ -45,6 +45,7 @@ pipecat-ai[remote-smart-turn]
 pipecat-ai[silero]
 pipecat-ai[simli]
 pipecat-ai[soundfile]
+pipecat-ai[soniox]
 pipecat-ai[tavus]
 pipecat-ai[together]
 # pipecat-ai[ultravox] # Mocked
diff --git a/dot-env.template b/dot-env.template
index 20d73b3ad..9d2128355 100644
--- a/dot-env.template
+++ b/dot-env.template
@@ -107,4 +107,7 @@ MINIMAX_API_KEY=...
 MINIMAX_GROUP_ID=...
 
 # Sarvam AI
-SARVAM_API_KEY=...
\ No newline at end of file
+SARVAM_API_KEY=...
+
+# Soniox
+SONIOX_API_KEY=...
\ No newline at end of file
diff --git a/examples/foundational/07za-interruptible-soniox.py b/examples/foundational/07za-interruptible-soniox.py
new file mode 100644
index 000000000..f3b3487d7
--- /dev/null
+++ b/examples/foundational/07za-interruptible-soniox.py
@@ -0,0 +1,115 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import argparse
+import os
+
+from dotenv import load_dotenv
+from loguru import logger
+
+from pipecat.audio.vad.silero import SileroVADAnalyzer
+from pipecat.pipeline.pipeline import Pipeline
+from pipecat.pipeline.runner import PipelineRunner
+from pipecat.pipeline.task import PipelineParams, PipelineTask
+from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
+from pipecat.services.cartesia.tts import CartesiaTTSService
+from pipecat.services.openai.llm import OpenAILLMService
+from pipecat.services.soniox.config import SonioxInputParams
+from pipecat.services.soniox.stt import SonioxSTTService
+from pipecat.transcriptions.language import Language
+from pipecat.transports.base_transport import TransportParams
+from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
+from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
+
+load_dotenv(override=True)
+
+
+async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespace):
+    logger.info(f"Starting bot")
+
+    transport = SmallWebRTCTransport(
+        webrtc_connection=webrtc_connection,
+        params=TransportParams(
+            audio_in_enabled=True,
+            audio_out_enabled=True,
+            vad_analyzer=SileroVADAnalyzer(),
+        ),
+    )
+
+    stt = SonioxSTTService(
+        api_key=os.getenv("SONIOX_API_KEY"),
+        params=SonioxInputParams(
+            # Add language hints to improve transcription accuracy. Variants are ignored.
+            # For example "en-GB" will be treated same as "en".
+            # List of supported languages: https://soniox.com/docs/speech-to-text/core-concepts/supported-languages
+            language_hints=[Language.EN, Language.ES, Language.JA, Language.ZH],
+        ),
+    )
+
+    tts = CartesiaTTSService(
+        api_key=os.getenv("CARTESIA_API_KEY"),
+        voice_id="71a7ad14-091c-4e8e-a314-022ece01c121",  # British Reading Lady
+    )
+
+    llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
+
+    messages = [
+        {
+            "role": "system",
+            "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
+        },
+    ]
+
+    context = OpenAILLMContext(messages)
+    context_aggregator = llm.create_context_aggregator(context)
+
+    pipeline = Pipeline(
+        [
+            transport.input(),  # Transport user input
+            stt,
+            context_aggregator.user(),  # User responses
+            llm,  # LLM
+            tts,  # TTS
+            transport.output(),  # Transport bot output
+            context_aggregator.assistant(),  # Assistant spoken responses
+        ]
+    )
+
+    task = PipelineTask(
+        pipeline,
+        params=PipelineParams(
+            allow_interruptions=True,
+            enable_metrics=True,
+            enable_usage_metrics=True,
+            report_only_initial_ttfb=True,
+        ),
+    )
+
+    @transport.event_handler("on_client_connected")
+    async def on_client_connected(transport, client):
+        logger.info(f"Client connected")
+        # Kick off the conversation.
+        messages.append({"role": "system", "content": "Please introduce yourself to the user."})
+        await task.queue_frames([context_aggregator.user().get_context_frame()])
+
+    @transport.event_handler("on_client_disconnected")
+    async def on_client_disconnected(transport, client):
+        logger.info(f"Client disconnected")
+
+    @transport.event_handler("on_client_closed")
+    async def on_client_closed(transport, client):
+        logger.info(f"Client closed connection")
+        await task.cancel()
+
+    runner = PipelineRunner(handle_sigint=False)
+
+    await runner.run(task)
+
+
+if __name__ == "__main__":
+    from run import main
+
+    main()
diff --git a/examples/foundational/13f-soniox-transcription.py b/examples/foundational/13f-soniox-transcription.py
new file mode 100644
index 000000000..6e9d356f3
--- /dev/null
+++ b/examples/foundational/13f-soniox-transcription.py
@@ -0,0 +1,77 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import argparse
+import os
+
+from dotenv import load_dotenv
+from loguru import logger
+
+from pipecat.frames.frames import Frame, TranscriptionFrame
+from pipecat.pipeline.pipeline import Pipeline
+from pipecat.pipeline.runner import PipelineRunner
+from pipecat.pipeline.task import PipelineTask
+from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
+from pipecat.services.soniox.config import SonioxInputParams
+from pipecat.services.soniox.stt import SonioxSTTService
+from pipecat.transcriptions.language import Language
+from pipecat.transports.base_transport import TransportParams
+from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
+from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
+
+load_dotenv(override=True)
+
+
+class TranscriptionLogger(FrameProcessor):
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, TranscriptionFrame):
+            print(f"Transcription: {frame.text}")
+
+
+async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespace):
+    logger.info(f"Starting bot")
+
+    transport = SmallWebRTCTransport(
+        webrtc_connection=webrtc_connection,
+        params=TransportParams(audio_in_enabled=True),
+    )
+
+    stt = SonioxSTTService(
+        api_key=os.getenv("SONIOX_API_KEY"),
+        params=SonioxInputParams(
+            # Add language hints to improve transcription accuracy. Variants are ignored.
+            # For example "en-GB" will be treated same as "en".
+            # List of supported languages: https://soniox.com/docs/speech-to-text/core-concepts/supported-languages
+            language_hints=[Language.EN, Language.ES, Language.JA, Language.ZH],
+        ),
+    )
+
+    tl = TranscriptionLogger()
+
+    pipeline = Pipeline([transport.input(), stt, tl])
+
+    task = PipelineTask(pipeline)
+
+    @transport.event_handler("on_client_disconnected")
+    async def on_client_disconnected(transport, client):
+        logger.info(f"Client disconnected")
+
+    @transport.event_handler("on_client_closed")
+    async def on_client_closed(transport, client):
+        logger.info(f"Client closed connection")
+        await task.cancel()
+
+    runner = PipelineRunner(handle_sigint=False)
+
+    await runner.run(task)
+
+
+if __name__ == "__main__":
+    from run import main
+
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index a2e50906d..a89cee5f8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -84,6 +84,7 @@ local-smart-turn = [ "coremltools>=8.0", "transformers", "torch==2.5.0", "torcha
 remote-smart-turn = []
 silero = [ "onnxruntime~=1.20.1" ]
 simli = [ "simli-ai~=0.1.10"]
+soniox = [ "websockets~=13.1" ]
 soundfile = [ "soundfile~=0.13.0" ]
 tavus=[]
 together = []
diff --git a/src/pipecat/services/soniox/__init__.py b/src/pipecat/services/soniox/__init__.py
new file mode 100644
index 000000000..c74b1c218
--- /dev/null
+++ b/src/pipecat/services/soniox/__init__.py
@@ -0,0 +1,13 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import sys
+
+from pipecat.services import DeprecatedModuleProxy
+
+from .stt import *
+
+sys.modules[__name__] = DeprecatedModuleProxy(globals(), "soniox", "soniox.stt")
diff --git a/src/pipecat/services/soniox/config.py b/src/pipecat/services/soniox/config.py
new file mode 100644
index 000000000..862b573f9
--- /dev/null
+++ b/src/pipecat/services/soniox/config.py
@@ -0,0 +1,36 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+from typing import List, Optional
+
+from pydantic import BaseModel
+
+from pipecat.transcriptions.language import Language
+
+
+class SonioxInputParams(BaseModel):
+    """Real-time transcription settings.
+
+    Attributes:
+        language_hints: Language hints passed to Soniox; regional variants are ignored
+        context: Optional context string sent to Soniox in the initial configuration
+    """
+
+    model: str = "stt-rt-preview"
+
+    audio_format: Optional[str] = "pcm_s16le"
+    num_channels: Optional[int] = 1
+    sample_rate: Optional[int] = 16000
+
+    language_hints: Optional[List[Language]] = None
+    context: Optional[str] = None
+
+    enable_non_final_tokens: Optional[bool] = True
+    max_non_final_tokens_duration_ms: Optional[int] = None
+
+    enable_endpoint_detection: Optional[bool] = True
+
+    client_reference_id: Optional[str] = None
diff --git a/src/pipecat/services/soniox/stt.py b/src/pipecat/services/soniox/stt.py
new file mode 100644
index 000000000..7d45c5a73
--- /dev/null
+++ b/src/pipecat/services/soniox/stt.py
@@ -0,0 +1,368 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+import asyncio
+import json
+import time
+from typing import AsyncGenerator, List, Optional
+
+from loguru import logger
+
+from pipecat.frames.frames import (
+    CancelFrame,
+    EndFrame,
+    ErrorFrame,
+    Frame,
+    InterimTranscriptionFrame,
+    StartFrame,
+    TranscriptionFrame,
+    UserStoppedSpeakingFrame,
+)
+from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.soniox.config import SonioxInputParams
+from pipecat.services.stt_service import STTService
+from pipecat.transcriptions.language import Language
+from pipecat.utils.time import time_now_iso8601
+
+try:
+    import websockets
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error("In order to use Soniox, you need to `pip install pipecat-ai[soniox]`.")
+    raise Exception(f"Missing module: {e}")
+
+
+KEEPALIVE_MESSAGE = json.dumps(
+    {
+        "type": "keepalive",
+    }
+)
+
+FINALIZE_MESSAGE = json.dumps(
+    {
+        "type": "finalize",
+    }
+)
+
+END_TOKEN = "<end>"
+
+FINALIZED_TOKEN = "<fin>"
+
+
+def is_end_token(token: dict) -> bool:
+    return token["text"] == END_TOKEN or token["text"] == FINALIZED_TOKEN
+
+
+def language_to_soniox_language(language: Language) -> str:
+    """Pipecat Language enum uses same ISO 2-letter codes as Soniox, except with added regional variants.
+
+    For a list of all supported languages, see: https://soniox.com/docs/speech-to-text/core-concepts/supported-languages
+    """
+    lang_str = str(language.value).lower()
+    if "-" in lang_str:
+        return lang_str.split("-")[0]
+    return lang_str
+
+
+def _prepare_language_hints(
+    language_hints: Optional[List[Language]],
+) -> Optional[List[str]]:
+    if language_hints is None:
+        return None
+
+    prepared_languages = [language_to_soniox_language(lang) for lang in language_hints]
+    # Remove duplicates (in case of language_hints with multiple regions)
+    return list(set(prepared_languages))
+
+
+class SonioxSTTService(STTService):
+    """Speech-to-Text service using Soniox's WebSocket API.
+
+    This service connects to Soniox's WebSocket API for real-time transcription
+    with support for language hints, custom context, endpoint detection,
+    and automatic finalization of pending transcripts.
+
+    For complete API documentation, see: https://soniox.com/docs/speech-to-text/api-reference/websocket-api
+    """
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        url: str = "wss://stt-rt.soniox.com/transcribe-websocket",
+        sample_rate: Optional[int] = None,
+        params: SonioxInputParams = SonioxInputParams(),
+        enable_vad: bool = True,
+        auto_finalize_delay_ms: int | None = 3000,
+        **kwargs,
+    ):
+        """Initialize the Soniox STT service.
+
+        Args:
+            api_key: Soniox API key
+            url: Soniox WebSocket API URL
+            sample_rate: Audio sample rate; falls back to `params.sample_rate` when not set.
+            params: Additional configuration parameters, such as language hints, context and
+                speaker diarization.
+            enable_vad: Listen to `UserStoppedSpeakingFrame` to send finalize message to Soniox.
+            auto_finalize_delay: If no new tokens are received for a while and there is active
+                transcription (only InterimTranscriptionFrame), finalize the transcription by
+                sending the finalize message so user can receive the final transcript. If set
+                to `None`, the auto finalize feature is disabled.
+            **kwargs: Additional arguments passed to the STTService
+        """
+        sample_rate = sample_rate or (params.sample_rate if params.sample_rate else None)
+        super().__init__(sample_rate=sample_rate, **kwargs)
+
+        self._api_key = api_key
+        self._url = url
+        self.set_model_name(params.model)
+        self._params = params
+        self._enable_vad = enable_vad
+        self._auto_finalize_delay_ms = auto_finalize_delay_ms
+        self._websocket = None
+
+        self._final_transcription_buffer = ""
+        self._last_tokens_received: float | None = None
+
+        self._receive_task = None
+        self._keepalive_task = None
+        self._finalize_if_no_tokens_task = None
+
+    async def start(self, frame: StartFrame):
+        """Start the Soniox STT websocket connection."""
+        await super().start(frame)
+        if self._websocket:
+            return
+
+        self._websocket = await websockets.connect(self._url)
+
+        if not self._websocket:
+            logger.error(f"Unable to connect to Soniox API at {self._url}")
+
+        # Send the initial configuration message
+        config = {
+            "api_key": self._api_key,
+            "model": self._model_name,
+            "audio_format": self._params.audio_format,
+            "num_channels": self._params.num_channels or 1,
+            "enable_endpoint_detection": self._params.enable_endpoint_detection,
+            "sample_rate": self._sample_rate,
+            "language_hints": _prepare_language_hints(self._params.language_hints),
+            "context": self._params.context,
+            "enable_non_final_tokens": self._params.enable_non_final_tokens,
+            "max_non_final_tokens_duration_ms": self._params.max_non_final_tokens_duration_ms,
+            "client_reference_id": self._params.client_reference_id,
+        }
+
+        # Send the configuration message
+        await self._websocket.send(json.dumps(config))
+
+        if self._websocket and not self._receive_task:
+            self._receive_task = self.create_task(self._receive_task_handler())
+        if self._websocket and not self._keepalive_task:
+            self._keepalive_task = self.create_task(self._keepalive_task_handler())
+        if (
+            self._websocket
+            and not self._finalize_if_no_tokens_task
+            and self._auto_finalize_delay_ms is not None
+        ):
+            self._finalize_if_no_tokens_task = self.create_task(
+                self._finalize_if_no_tokens_task_handler()
+            )
+
+    async def _cleanup(self):
+        if self._keepalive_task:
+            await self.cancel_task(self._keepalive_task)
+            self._keepalive_task = None
+
+        if self._websocket:
+            await self._websocket.close()
+            self._websocket = None
+
+        if self._receive_task:
+            await self.wait_for_task(self._receive_task)
+            self._receive_task = None
+
+        if self._finalize_if_no_tokens_task:
+            await self.cancel_task(self._finalize_if_no_tokens_task)
+            self._finalize_if_no_tokens_task = None
+
+    async def stop(self, frame: EndFrame):
+        """Stop the Soniox STT websocket connection.
+
+        Stopping waits for the server to close the connection as we might receive
+        additional final tokens after sending the stop recording message.
+        """
+        await super().stop(frame)
+        await self._send_stop_recording()
+
+    async def cancel(self, frame: CancelFrame):
+        """Cancel the Soniox STT websocket connection.
+
+        Compared to stop, this method closes the connection immediately without waiting
+        for the server to close it. This is useful when we want to stop the connection
+        immediately without waiting for the server to send any final tokens.
+        """
+        await super().cancel(frame)
+        await self._cleanup()
+
+    async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
+        """Send audio data to Soniox STT Service."""
+        await self.start_processing_metrics()
+        if self._websocket and not self._websocket.closed:
+            await self._websocket.send(audio)
+        await self.stop_processing_metrics()
+
+        yield None
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Process a frame and request finalization when the user stops speaking (VAD enabled)."""
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, UserStoppedSpeakingFrame) and self._enable_vad:
+            # Send finalize message to Soniox so we get the final tokens asap.
+            if self._websocket and not self._websocket.closed:
+                await self._websocket.send(FINALIZE_MESSAGE)
+                logger.debug(f"Triggered finalize event on: {frame.name=}, {direction=}")
+
+    async def _send_stop_recording(self):
+        if self._websocket and not self._websocket.closed:
+            # Send stop recording message
+            await self._websocket.send("")
+
+    async def _keepalive_task_handler(self):
+        """Send a keepalive message every 5 seconds so the connection stays open."""
+        try:
+            while True:
+                logger.debug("Sending keepalive message")
+                if self._websocket and not self._websocket.closed:
+                    await self._websocket.send(KEEPALIVE_MESSAGE)
+                else:
+                    logger.debug("WebSocket connection closed.")
+                    break
+                await asyncio.sleep(5)
+
+        except websockets.exceptions.ConnectionClosed:
+            # Expected when closing the connection
+            logger.debug("WebSocket connection closed, keepalive task stopped.")
+        except Exception as e:
+            logger.error(f"{self} error (_keepalive_task_handler): {e}")
+            await self.push_error(ErrorFrame(f"{self} error (_keepalive_task_handler): {e}"))
+
+    async def _receive_task_handler(self):
+        if not self._websocket:
+            return
+
+        # Transcription frame will be only sent after we get the "endpoint" event.
+        self._final_transcription_buffer = ""
+
+        async def send_endpoint_transcript():
+            if self._final_transcription_buffer:
+                await self.push_frame(
+                    TranscriptionFrame(
+                        self._final_transcription_buffer,
+                        "",
+                        time_now_iso8601(),
+                    )
+                )
+                self._final_transcription_buffer = ""
+
+        try:
+            async for message in self._websocket:
+                content = json.loads(message)
+
+                tokens = content["tokens"]
+
+                if tokens:
+                    # Got at least one token, so we can reset the auto finalize delay
+                    self._last_tokens_received = time.time()
+
+                # We will only send the final tokens after we get the "endpoint" event
+                non_final_transcription = ""
+
+                for token in tokens:
+                    if token["is_final"]:
+                        if is_end_token(token):
+                            # Found an endpoint, tokens until here will be sent as transcript,
+                            # the rest will be sent as interim tokens (even final tokens).
+                            await send_endpoint_transcript()
+                        else:
+                            self._final_transcription_buffer += token["text"]
+                    else:
+                        non_final_transcription += token["text"]
+
+                if self._final_transcription_buffer or non_final_transcription:
+                    await self.push_frame(
+                        InterimTranscriptionFrame(
+                            # Even final tokens are sent as interim tokens as we want to send
+                            # nicely formatted messages - therefore waiting for the endpoint.
+                            self._final_transcription_buffer + non_final_transcription,
+                            "",
+                            time_now_iso8601(),
+                        )
+                    )
+
+                error_code = content.get("error_code")
+                error_message = content.get("error_message")
+                if error_code or error_message:
+                    # In case of error, still send the final transcript (if any remaining in the buffer)
+                    await send_endpoint_transcript()
+                    logger.error(
+                        f"{self} error: {error_code} (_receive_task_handler) - {error_message}"
+                    )
+                    await self.push_error(
+                        ErrorFrame(
+                            f"{self} error: {error_code} (_receive_task_handler) - {error_message}"
+                        )
+                    )
+
+                finished = content.get("finished")
+                if finished:
+                    # When finished, still send the final transcript (if any remaining in the buffer)
+                    await send_endpoint_transcript()
+                    logger.debug("Transcription finished.")
+                    await self._cleanup()
+
+        except websockets.exceptions.ConnectionClosed:
+            # Expected when closing the connection
+            pass
+        except Exception as e:
+            logger.error(f"{self} error: {e}")
+            await self.push_error(ErrorFrame(f"{self} error: {e}"))
+
+    async def _finalize_if_no_tokens_task_handler(self):
+        """Call finalize if no new tokens are received for a configured duration."""
+        if not self._websocket or self._websocket.closed or self._auto_finalize_delay_ms is None:
+            return
+
+        try:
+            while True:
+                await asyncio.sleep(0.5)
+
+                if not self._websocket or self._websocket.closed:
+                    break
+
+                # Check if we have anything to send
+                if not self._final_transcription_buffer:
+                    continue
+
+                # Check if enough time has passed since the last tokens were received
+                if self._last_tokens_received:
+                    last_token_age_ms = (time.time() - self._last_tokens_received) * 1000
+
+                    if last_token_age_ms > self._auto_finalize_delay_ms:
+                        # No new tokens received for a while, finalize the transcription
+                        logger.debug("No pending frames, sending finalize message")
+                        await self._websocket.send(FINALIZE_MESSAGE)
+        except websockets.exceptions.ConnectionClosed:
+            # Expected when closing the connection
+            pass
+        except Exception as e:
+            logger.error(f"{self} error (_finalize_if_no_tokens_task_handler): {e}")
+            await self.push_error(
+                ErrorFrame(f"{self} error (_finalize_if_no_tokens_task_handler): {e}")
+            )

From 95fe7627769e66f79dbcba8b5d0434c04e3728c0 Mon Sep 17 00:00:00 2001
From: Matej Marinko <matej.marinko@soniox.com>
Date: Thu, 29 May 2025 09:23:37 +0200
Subject: [PATCH 02/18] Fix typo

---
 src/pipecat/services/soniox/stt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pipecat/services/soniox/stt.py b/src/pipecat/services/soniox/stt.py
index 7d45c5a73..11ca717bf 100644
--- a/src/pipecat/services/soniox/stt.py
+++ b/src/pipecat/services/soniox/stt.py
@@ -108,7 +108,7 @@ class SonioxSTTService(STTService):
             params: Additional configuration parameters, such as language hints, context and
                 speaker diarization.
             enable_vad: Listen to `UserStoppedSpeakingFrame` to send finalize message to Soniox.
-            auto_finalize_delay: If no new tokens are received for a while and there is active
+            auto_finalize_delay_ms: If no new tokens are received for a while and there is active
                 transcription (only InterimTranscriptionFrame), finalize the transcription by
                 sending the finalize message so user can receive the final transcript. If set
                 to `None`, the auto finalize feature is disabled.

From 51b79bd6a1bea2599dd2f40c6fe276853792f193 Mon Sep 17 00:00:00 2001
From: Matej Marinko <matej.marinko@soniox.com>
Date: Thu, 29 May 2025 10:11:11 +0200
Subject: [PATCH 03/18] Minor code style changes

---
 src/pipecat/services/soniox/stt.py | 46 ++++++++++++------------------
 1 file changed, 19 insertions(+), 27 deletions(-)

diff --git a/src/pipecat/services/soniox/stt.py b/src/pipecat/services/soniox/stt.py
index 11ca717bf..891179259 100644
--- a/src/pipecat/services/soniox/stt.py
+++ b/src/pipecat/services/soniox/stt.py
@@ -35,17 +35,9 @@ except ModuleNotFoundError as e:
     raise Exception(f"Missing module: {e}")
 
 
-KEEPALIVE_MESSAGE = json.dumps(
-    {
-        "type": "keepalive",
-    }
-)
+KEEPALIVE_MESSAGE = '{"type": "keepalive"}'
 
-FINALIZE_MESSAGE = json.dumps(
-    {
-        "type": "finalize",
-    }
-)
+FINALIZE_MESSAGE = '{"type": "finalize"}'
 
 END_TOKEN = "<end>"
 
@@ -74,7 +66,7 @@ def _prepare_language_hints(
         return None
 
     prepared_languages = [language_to_soniox_language(lang) for lang in language_hints]
-    # Remove duplicates (in case of language_hints with multiple regions)
+    # Remove duplicates (in case of language_hints with multiple regions).
     return list(set(prepared_languages))
 
 
@@ -96,14 +88,14 @@ class SonioxSTTService(STTService):
         sample_rate: Optional[int] = None,
         params: SonioxInputParams = SonioxInputParams(),
         enable_vad: bool = True,
-        auto_finalize_delay_ms: int | None = 3000,
+        auto_finalize_delay_ms: Optional[int] = 3000,
         **kwargs,
     ):
         """Initialize the Soniox STT service.
 
         Args:
-            api_key: Soniox API key
-            url: Soniox WebSocket API URL
+            api_key: Soniox API key.
+            url: Soniox WebSocket API URL.
             model: Transcription model to use.
             params: Additional configuration parameters, such as language hints, context and
                 speaker diarization.
@@ -112,7 +104,7 @@ class SonioxSTTService(STTService):
                 transcription (only InterimTranscriptionFrame), finalize the transcription by
                 sending the finalize message so user can receive the final transcript. If set
                 to `None`, the auto finalize feature is disabled.
-            **kwargs: Additional arguments passed to the STTService
+            **kwargs: Additional arguments passed to the STTService.
         """
         sample_rate = sample_rate or (params.sample_rate if params.sample_rate else None)
         super().__init__(sample_rate=sample_rate, **kwargs)
@@ -126,7 +118,7 @@ class SonioxSTTService(STTService):
         self._websocket = None
 
         self._final_transcription_buffer = ""
-        self._last_tokens_received: float | None = None
+        self._last_tokens_received: Optional[float] = None
 
         self._receive_task = None
         self._keepalive_task = None
@@ -143,7 +135,7 @@ class SonioxSTTService(STTService):
         if not self._websocket:
             logger.error(f"Unable to connect to Soniox API at {self._url}")
 
-        # Send the initial configuration message
+        # Send the initial configuration message.
         config = {
             "api_key": self._api_key,
             "model": self._model_name,
@@ -158,7 +150,7 @@ class SonioxSTTService(STTService):
             "client_reference_id": self._params.client_reference_id,
         }
 
-        # Send the configuration message
+        # Send the configuration message.
         await self._websocket.send(json.dumps(config))
 
         if self._websocket and not self._receive_task:
@@ -278,10 +270,10 @@ class SonioxSTTService(STTService):
                 tokens = content["tokens"]
 
                 if tokens:
-                    # Got at least one token, so we can reset the auto finalize delay
+                    # Got at least one token, so we can reset the auto finalize delay.
                     self._last_tokens_received = time.time()
 
-                # We will only send the final tokens after we get the "endpoint" event
+                # We will only send the final tokens after we get the "endpoint" event.
                 non_final_transcription = ""
 
                 for token in tokens:
@@ -309,7 +301,7 @@ class SonioxSTTService(STTService):
                 error_code = content.get("error_code")
                 error_message = content.get("error_message")
                 if error_code or error_message:
-                    # In case of error, still send the final transcript (if any remaining in the buffer)
+                    # In case of error, still send the final transcript (if any remaining in the buffer).
                     await send_endpoint_transcript()
                     logger.error(
                         f"{self} error: {error_code} (_receive_task_handler) - {error_message}"
@@ -322,13 +314,13 @@ class SonioxSTTService(STTService):
 
                 finished = content.get("finished")
                 if finished:
-                    # When finished, still send the final transcript (if any remaining in the buffer)
+                    # When finished, still send the final transcript (if any remaining in the buffer).
                     await send_endpoint_transcript()
                     logger.debug("Transcription finished.")
                     await self._cleanup()
 
         except websockets.exceptions.ConnectionClosed:
-            # Expected when closing the connection
+            # Expected when closing the connection.
             pass
         except Exception as e:
             logger.error(f"{self} error: {e}")
@@ -346,20 +338,20 @@ class SonioxSTTService(STTService):
                 if not self._websocket or self._websocket.closed:
                     break
 
-                # Check if we have anything to send
+                # Check if we have anything to send.
                 if not self._final_transcription_buffer:
                     continue
 
-                # Check if enough time has passed since the last tokens were received
+                # Check if enough time has passed since the last tokens were received.
                 if self._last_tokens_received:
                     last_token_age_ms = (time.time() - self._last_tokens_received) * 1000
 
                     if last_token_age_ms > self._auto_finalize_delay_ms:
-                        # No new tokens received for a while, finalize the transcription
+                        # No new tokens received for a while, finalize the transcription.
                         logger.debug("No pending frames, sending finalize message")
                         await self._websocket.send(FINALIZE_MESSAGE)
         except websockets.exceptions.ConnectionClosed:
-            # Expected when closing the connection
+            # Expected when closing the connection.
             pass
         except Exception as e:
             logger.error(f"{self} error (_finalize_if_no_tokens_task_handler): {e}")

From db7b60cfe9dfc021298df9d576664ac1623540f4 Mon Sep 17 00:00:00 2001
From: Matej Marinko <matej.marinko@soniox.com>
Date: Thu, 29 May 2025 13:24:53 +0200
Subject: [PATCH 04/18] Auto finalize fix

---
 src/pipecat/services/soniox/stt.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/pipecat/services/soniox/stt.py b/src/pipecat/services/soniox/stt.py
index 891179259..fbffd9c19 100644
--- a/src/pipecat/services/soniox/stt.py
+++ b/src/pipecat/services/soniox/stt.py
@@ -349,6 +349,7 @@ class SonioxSTTService(STTService):
                     if last_token_age_ms > self._auto_finalize_delay_ms:
                         # No new tokens received for a while, finalize the transcription.
                         logger.debug("No pending frames, sending finalize message")
+                        self._last_tokens_received = None
                         await self._websocket.send(FINALIZE_MESSAGE)
         except websockets.exceptions.ConnectionClosed:
             # Expected when closing the connection.

From ee5fea422138d45d8331dc8c9395e84a805e0060 Mon Sep 17 00:00:00 2001
From: Matej Marinko <matej.marinko@soniox.com>
Date: Thu, 29 May 2025 14:58:35 +0200
Subject: [PATCH 05/18] Fix auto finalization cycle

---
 src/pipecat/services/soniox/stt.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/pipecat/services/soniox/stt.py b/src/pipecat/services/soniox/stt.py
index fbffd9c19..dd425bcbf 100644
--- a/src/pipecat/services/soniox/stt.py
+++ b/src/pipecat/services/soniox/stt.py
@@ -45,6 +45,7 @@ FINALIZED_TOKEN = "<fin>"
 
 
 def is_end_token(token: dict) -> bool:
+    """Determine if a token is an end token."""
     return token["text"] == END_TOKEN or token["text"] == FINALIZED_TOKEN
 
 
@@ -270,8 +271,12 @@ class SonioxSTTService(STTService):
                 tokens = content["tokens"]
 
                 if tokens:
-                    # Got at least one token, so we can reset the auto finalize delay.
-                    self._last_tokens_received = time.time()
+                    if len(tokens) == 1 and tokens[0]["text"] == FINALIZED_TOKEN:
+                        # Ignore finalized token, prevent auto-finalize cycling.
+                        pass
+                    else:
+                        # Got at least one token, so we can reset the auto finalize delay.
+                        self._last_tokens_received = time.time()
 
                 # We will only send the final tokens after we get the "endpoint" event.
                 non_final_transcription = ""

From c54084b7a413dd7b38c099764264a695d9482a9a Mon Sep 17 00:00:00 2001
From: Matej Marinko <matej.marinko@soniox.com>
Date: Mon, 23 Jun 2025 14:18:29 +0200
Subject: [PATCH 06/18] Fix deadlock on STT service stop

---
 src/pipecat/services/soniox/stt.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/pipecat/services/soniox/stt.py b/src/pipecat/services/soniox/stt.py
index dd425bcbf..b119a4265 100644
--- a/src/pipecat/services/soniox/stt.py
+++ b/src/pipecat/services/soniox/stt.py
@@ -176,14 +176,16 @@ class SonioxSTTService(STTService):
             await self._websocket.close()
             self._websocket = None
 
-        if self._receive_task:
-            await self.wait_for_task(self._receive_task)
-            self._receive_task = None
-
         if self._finalize_if_no_tokens_task:
             await self.cancel_task(self._finalize_if_no_tokens_task)
             self._finalize_if_no_tokens_task = None
 
+        if self._receive_task:
+            # Task cannot cancel itself. If task called _cleanup() we expect it to cancel itself.
+            if self._receive_task != asyncio.current_task():
+                await self.wait_for_task(self._receive_task)
+            self._receive_task = None
+
     async def stop(self, frame: EndFrame):
         """Stop the Soniox STT websocket connection.
 
@@ -323,6 +325,7 @@ class SonioxSTTService(STTService):
                     await send_endpoint_transcript()
                     logger.debug("Transcription finished.")
                     await self._cleanup()
+                    return
 
         except websockets.exceptions.ConnectionClosed:
             # Expected when closing the connection.

From dc47516e14d6c82708db4077c83923d1103fc88f Mon Sep 17 00:00:00 2001
From: matejmarinko-soniox <matej.marinko@soniox.com>
Date: Wed, 9 Jul 2025 08:04:59 +0200
Subject: [PATCH 07/18] Update src/pipecat/services/soniox/config.py

Co-authored-by: Mark Backman <m.backman@gmail.com>
---
 src/pipecat/services/soniox/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pipecat/services/soniox/config.py b/src/pipecat/services/soniox/config.py
index 862b573f9..2980a9e81 100644
--- a/src/pipecat/services/soniox/config.py
+++ b/src/pipecat/services/soniox/config.py
@@ -14,7 +14,7 @@ from pipecat.transcriptions.language import Language
 class SonioxInputParams(BaseModel):
     """Real-time transcription settings.
 
-    Attributes:
+    Parameters:
         languages: List of language codes to use for transcription
         code_switching: Whether to auto-detect language changes during transcription
     """

From 8daaea5969c0a314d2104b24a83b465ea3ea228c Mon Sep 17 00:00:00 2001
From: Matej Marinko <matej.marinko@soniox.com>
Date: Wed, 9 Jul 2025 09:03:02 +0200
Subject: [PATCH 08/18] Minor code cleanup

---
 src/pipecat/services/soniox/__init__.py | 13 ------
 src/pipecat/services/soniox/stt.py      | 55 +++----------------------
 2 files changed, 5 insertions(+), 63 deletions(-)

diff --git a/src/pipecat/services/soniox/__init__.py b/src/pipecat/services/soniox/__init__.py
index c74b1c218..e69de29bb 100644
--- a/src/pipecat/services/soniox/__init__.py
+++ b/src/pipecat/services/soniox/__init__.py
@@ -1,13 +0,0 @@
-#
-# Copyright (c) 2024–2025, Daily
-#
-# SPDX-License-Identifier: BSD 2-Clause License
-#
-
-import sys
-
-from pipecat.services import DeprecatedModuleProxy
-
-from .stt import *
-
-sys.modules[__name__] = DeprecatedModuleProxy(globals(), "soniox", "soniox.stt")
diff --git a/src/pipecat/services/soniox/stt.py b/src/pipecat/services/soniox/stt.py
index b119a4265..e598422a9 100644
--- a/src/pipecat/services/soniox/stt.py
+++ b/src/pipecat/services/soniox/stt.py
@@ -87,7 +87,7 @@ class SonioxSTTService(STTService):
         api_key: str,
         url: str = "wss://stt-rt.soniox.com/transcribe-websocket",
         sample_rate: Optional[int] = None,
-        params: SonioxInputParams = SonioxInputParams(),
+        params: Optional[SonioxInputParams] = None,
         enable_vad: bool = True,
         auto_finalize_delay_ms: Optional[int] = 3000,
         **kwargs,
@@ -98,6 +98,7 @@ class SonioxSTTService(STTService):
             api_key: Soniox API key.
             url: Soniox WebSocket API URL.
             model: Transcription model to use.
+            sample_rate: Audio sample rate. If None, uses value from `params`.
             params: Additional configuration parameters, such as language hints, context and
                 speaker diarization.
             enable_vad: Listen to `UserStoppedSpeakingFrame` to send finalize message to Soniox.
@@ -109,6 +110,7 @@ class SonioxSTTService(STTService):
         """
         sample_rate = sample_rate or (params.sample_rate if params.sample_rate else None)
         super().__init__(sample_rate=sample_rate, **kwargs)
+        params = params or SonioxInputParams()
 
         self._api_key = api_key
         self._url = url
@@ -123,7 +125,6 @@ class SonioxSTTService(STTService):
 
         self._receive_task = None
         self._keepalive_task = None
-        self._finalize_if_no_tokens_task = None
 
     async def start(self, frame: StartFrame):
         """Start the Soniox STT websocket connection."""
@@ -158,14 +159,6 @@ class SonioxSTTService(STTService):
             self._receive_task = self.create_task(self._receive_task_handler())
         if self._websocket and not self._keepalive_task:
             self._keepalive_task = self.create_task(self._keepalive_task_handler())
-        if (
-            self._websocket
-            and not self._finalize_if_no_tokens_task
-            and self._auto_finalize_delay_ms is not None
-        ):
-            self._finalize_if_no_tokens_task = self.create_task(
-                self._finalize_if_no_tokens_task_handler()
-            )
 
     async def _cleanup(self):
         if self._keepalive_task:
@@ -176,10 +169,6 @@ class SonioxSTTService(STTService):
             await self._websocket.close()
             self._websocket = None
 
-        if self._finalize_if_no_tokens_task:
-            await self.cancel_task(self._finalize_if_no_tokens_task)
-            self._finalize_if_no_tokens_task = None
-
         if self._receive_task:
             # Task cannot cancel itself. If task called _cleanup() we expect it to cancel itself.
             if self._receive_task != asyncio.current_task():
@@ -260,7 +249,7 @@ class SonioxSTTService(STTService):
                 await self.push_frame(
                     TranscriptionFrame(
                         self._final_transcription_buffer,
-                        "",
+                        self._user_id,
                         time_now_iso8601(),
                     )
                 )
@@ -300,7 +289,7 @@ class SonioxSTTService(STTService):
                             # Even final tokens are sent as interim tokens as we want to send
                             # nicely formatted messages - therefore waiting for the endpoint.
                             self._final_transcription_buffer + non_final_transcription,
-                            "",
+                            self._user_id,
                             time_now_iso8601(),
                         )
                     )
@@ -333,37 +322,3 @@ class SonioxSTTService(STTService):
         except Exception as e:
             logger.error(f"{self} error: {e}")
             await self.push_error(ErrorFrame(f"{self} error: {e}"))
-
-    async def _finalize_if_no_tokens_task_handler(self):
-        """Call finalize if no new tokens are received for a configured duration."""
-        if not self._websocket or self._websocket.closed or self._auto_finalize_delay_ms is None:
-            return
-
-        try:
-            while True:
-                await asyncio.sleep(0.5)
-
-                if not self._websocket or self._websocket.closed:
-                    break
-
-                # Check if we have anything to send.
-                if not self._final_transcription_buffer:
-                    continue
-
-                # Check if enough time has passed since the last tokens were received.
-                if self._last_tokens_received:
-                    last_token_age_ms = (time.time() - self._last_tokens_received) * 1000
-
-                    if last_token_age_ms > self._auto_finalize_delay_ms:
-                        # No new tokens received for a while, finalize the transcription.
-                        logger.debug("No pending frames, sending finalize message")
-                        self._last_tokens_received = None
-                        await self._websocket.send(FINALIZE_MESSAGE)
-        except websockets.exceptions.ConnectionClosed:
-            # Expected when closing the connection.
-            pass
-        except Exception as e:
-            logger.error(f"{self} error (_finalize_if_no_tokens_task_handler): {e}")
-            await self.push_error(
-                ErrorFrame(f"{self} error (_finalize_if_no_tokens_task_handler): {e}")
-            )

From 3cdaeb719a0c8c480e9fab7dd3d5876f351cdcde Mon Sep 17 00:00:00 2001
From: Matej Marinko <matej.marinko@soniox.com>
Date: Wed, 9 Jul 2025 09:28:43 +0200
Subject: [PATCH 09/18] Update examples to new format

---
 .../foundational/07za-interruptible-soniox.py | 54 +++++++++----------
 ...ription.py => 13i-soniox-transcription.py} | 38 +++++++------
 src/pipecat/services/soniox/stt.py            |  4 +-
 3 files changed, 50 insertions(+), 46 deletions(-)
 rename examples/foundational/{13f-soniox-transcription.py => 13i-soniox-transcription.py} (67%)

diff --git a/examples/foundational/07za-interruptible-soniox.py b/examples/foundational/07za-interruptible-soniox.py
index f3b3487d7..d909d5e1d 100644
--- a/examples/foundational/07za-interruptible-soniox.py
+++ b/examples/foundational/07za-interruptible-soniox.py
@@ -20,33 +20,36 @@ from pipecat.services.openai.llm import OpenAILLMService
 from pipecat.services.soniox.config import SonioxInputParams
 from pipecat.services.soniox.stt import SonioxSTTService
 from pipecat.transcriptions.language import Language
-from pipecat.transports.base_transport import TransportParams
-from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
-from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
+from pipecat.transports.base_transport import BaseTransport, TransportParams
+from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
+from pipecat.transports.services.daily import DailyParams
 
 load_dotenv(override=True)
 
+transport_params = {
+    "daily": lambda: DailyParams(
+        audio_in_enabled=True,
+        audio_out_enabled=True,
+        vad_analyzer=SileroVADAnalyzer(),
+    ),
+    "twilio": lambda: FastAPIWebsocketParams(
+        audio_in_enabled=True,
+        audio_out_enabled=True,
+        vad_analyzer=SileroVADAnalyzer(),
+    ),
+    "webrtc": lambda: TransportParams(
+        audio_in_enabled=True,
+        audio_out_enabled=True,
+        vad_analyzer=SileroVADAnalyzer(),
+    ),
+}
 
-async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespace):
+
+async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool):
     logger.info(f"Starting bot")
 
-    transport = SmallWebRTCTransport(
-        webrtc_connection=webrtc_connection,
-        params=TransportParams(
-            audio_in_enabled=True,
-            audio_out_enabled=True,
-            vad_analyzer=SileroVADAnalyzer(),
-        ),
-    )
-
     stt = SonioxSTTService(
         api_key=os.getenv("SONIOX_API_KEY"),
-        params=SonioxInputParams(
-            # Add language hints to improve transcription accuracy. Variants are ignored.
-            # For example "en-GB" will be treated same as "en".
-            # List of supported languages: https://soniox.com/docs/speech-to-text/core-concepts/supported-languages
-            language_hints=[Language.EN, Language.ES, Language.JA, Language.ZH],
-        ),
     )
 
     tts = CartesiaTTSService(
@@ -77,14 +80,11 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac
             context_aggregator.assistant(),  # Assistant spoken responses
         ]
     )
-
     task = PipelineTask(
         pipeline,
         params=PipelineParams(
-            allow_interruptions=True,
             enable_metrics=True,
             enable_usage_metrics=True,
-            report_only_initial_ttfb=True,
         ),
     )
 
@@ -98,18 +98,14 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac
     @transport.event_handler("on_client_disconnected")
     async def on_client_disconnected(transport, client):
         logger.info(f"Client disconnected")
-
-    @transport.event_handler("on_client_closed")
-    async def on_client_closed(transport, client):
-        logger.info(f"Client closed connection")
         await task.cancel()
 
-    runner = PipelineRunner(handle_sigint=False)
+    runner = PipelineRunner(handle_sigint=handle_sigint)
 
     await runner.run(task)
 
 
 if __name__ == "__main__":
-    from run import main
+    from pipecat.examples.run import main
 
-    main()
+    main(run_example, transport_params=transport_params)
diff --git a/examples/foundational/13f-soniox-transcription.py b/examples/foundational/13i-soniox-transcription.py
similarity index 67%
rename from examples/foundational/13f-soniox-transcription.py
rename to examples/foundational/13i-soniox-transcription.py
index 6e9d356f3..12760846c 100644
--- a/examples/foundational/13f-soniox-transcription.py
+++ b/examples/foundational/13i-soniox-transcription.py
@@ -10,6 +10,7 @@ import os
 from dotenv import load_dotenv
 from loguru import logger
 
+from pipecat.audio.vad.silero import SileroVADAnalyzer
 from pipecat.frames.frames import Frame, TranscriptionFrame
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
@@ -18,9 +19,11 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 from pipecat.services.soniox.config import SonioxInputParams
 from pipecat.services.soniox.stt import SonioxSTTService
 from pipecat.transcriptions.language import Language
-from pipecat.transports.base_transport import TransportParams
+from pipecat.transports.base_transport import BaseTransport, TransportParams
+from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
 from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
 from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
+from pipecat.transports.services.daily import DailyParams
 
 load_dotenv(override=True)
 
@@ -33,22 +36,27 @@ class TranscriptionLogger(FrameProcessor):
             print(f"Transcription: {frame.text}")
 
 
-async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespace):
-    logger.info(f"Starting bot")
+transport_params = {
+    "daily": lambda: DailyParams(
+        audio_in_enabled=True,
+        vad_analyzer=SileroVADAnalyzer(),
+    ),
+    "twilio": lambda: FastAPIWebsocketParams(
+        audio_in_enabled=True,
+        vad_analyzer=SileroVADAnalyzer(),
+    ),
+    "webrtc": lambda: TransportParams(
+        audio_in_enabled=True,
+        vad_analyzer=SileroVADAnalyzer(),
+    ),
+}
 
-    transport = SmallWebRTCTransport(
-        webrtc_connection=webrtc_connection,
-        params=TransportParams(audio_in_enabled=True),
-    )
+
+async def run_example(transport: BaseTransport, _: argparse.Namespace, handle_sigint: bool):
+    logger.info(f"Starting bot")
 
     stt = SonioxSTTService(
         api_key=os.getenv("SONIOX_API_KEY"),
-        params=SonioxInputParams(
-            # Add language hints to improve transcription accuracy. Variants are ignored.
-            # For example "en-GB" will be treated same as "en".
-            # List of supported languages: https://soniox.com/docs/speech-to-text/core-concepts/supported-languages
-            language_hints=[Language.EN, Language.ES, Language.JA, Language.ZH],
-        ),
     )
 
     tl = TranscriptionLogger()
@@ -72,6 +80,6 @@ async def run_bot(webrtc_connection: SmallWebRTCConnection, _: argparse.Namespac
 
 
 if __name__ == "__main__":
-    from run import main
+    from pipecat.examples.run import main
 
-    main()
+    main(run_example, transport_params=transport_params)
diff --git a/src/pipecat/services/soniox/stt.py b/src/pipecat/services/soniox/stt.py
index e598422a9..85b755d7e 100644
--- a/src/pipecat/services/soniox/stt.py
+++ b/src/pipecat/services/soniox/stt.py
@@ -88,7 +88,7 @@ class SonioxSTTService(STTService):
         url: str = "wss://stt-rt.soniox.com/transcribe-websocket",
         sample_rate: Optional[int] = None,
         params: Optional[SonioxInputParams] = None,
-        enable_vad: bool = True,
+        enable_vad: bool = False,
         auto_finalize_delay_ms: Optional[int] = 3000,
         **kwargs,
     ):
@@ -108,7 +108,7 @@ class SonioxSTTService(STTService):
                 to `None`, the auto finalize feature is disabled.
             **kwargs: Additional arguments passed to the STTService.
         """
-        sample_rate = sample_rate or (params.sample_rate if params.sample_rate else None)
+        sample_rate = sample_rate or (params.sample_rate if params and params.sample_rate else None)
         super().__init__(sample_rate=sample_rate, **kwargs)
         params = params or SonioxInputParams()
 

From 7becce9e8cef5e239121f3ad3a65e4850523852a Mon Sep 17 00:00:00 2001
From: Matej Marinko <matej.marinko@soniox.com>
Date: Wed, 9 Jul 2025 09:37:58 +0200
Subject: [PATCH 10/18] Add transcript tracing

---
 src/pipecat/services/soniox/stt.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/pipecat/services/soniox/stt.py b/src/pipecat/services/soniox/stt.py
index 85b755d7e..547cb4e79 100644
--- a/src/pipecat/services/soniox/stt.py
+++ b/src/pipecat/services/soniox/stt.py
@@ -26,6 +26,7 @@ from pipecat.services.soniox.config import SonioxInputParams
 from pipecat.services.stt_service import STTService
 from pipecat.transcriptions.language import Language
 from pipecat.utils.time import time_now_iso8601
+from pipecat.utils.tracing.service_decorators import traced_stt
 
 try:
     import websockets
@@ -203,6 +204,13 @@ class SonioxSTTService(STTService):
 
         yield None
 
+    @traced_stt
+    async def _handle_transcription(
+        self, transcript: str, is_final: bool, language: Optional[Language] = None
+    ):
+        """Handle a transcription result with tracing."""
+        pass
+
     async def process_frame(self, frame: Frame, direction: FrameDirection):
         """Processes a frame of audio data, either buffering or transcribing it."""
         await super().process_frame(frame, direction)
@@ -253,6 +261,8 @@ class SonioxSTTService(STTService):
                         time_now_iso8601(),
                     )
                 )
+                await self._handle_transcription(self._final_transcription_buffer, is_final=True)
+                await self.stop_processing_metrics()
                 self._final_transcription_buffer = ""
 
         try:

From 98e24131bd24fd81b2910c390820b9716d7cbf2e Mon Sep 17 00:00:00 2001
From: Matej Marinko <matej.marinko@soniox.com>
Date: Wed, 9 Jul 2025 09:59:04 +0200
Subject: [PATCH 11/18] Send raw result

---
 src/pipecat/services/soniox/stt.py | 36 +++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/src/pipecat/services/soniox/stt.py b/src/pipecat/services/soniox/stt.py
index 547cb4e79..b5a582286 100644
--- a/src/pipecat/services/soniox/stt.py
+++ b/src/pipecat/services/soniox/stt.py
@@ -121,7 +121,7 @@ class SonioxSTTService(STTService):
         self._auto_finalize_delay_ms = auto_finalize_delay_ms
         self._websocket = None
 
-        self._final_transcription_buffer = ""
+        self._final_transcription_buffer = []
         self._last_tokens_received: Optional[float] = None
 
         self._receive_task = None
@@ -250,20 +250,22 @@ class SonioxSTTService(STTService):
             return
 
         # Transcription frame will be only sent after we get the "endpoint" event.
-        self._final_transcription_buffer = ""
+        self._final_transcription_buffer = []
 
         async def send_endpoint_transcript():
             if self._final_transcription_buffer:
+                text = "".join(map(lambda token: token["text"], self._final_transcription_buffer))
                 await self.push_frame(
                     TranscriptionFrame(
-                        self._final_transcription_buffer,
-                        self._user_id,
-                        time_now_iso8601(),
+                        text=text,
+                        user_id=self._user_id,
+                        timestamp=time_now_iso8601(),
+                        result=self._final_transcription_buffer,
                     )
                 )
-                await self._handle_transcription(self._final_transcription_buffer, is_final=True)
+                await self._handle_transcription(text, is_final=True)
                 await self.stop_processing_metrics()
-                self._final_transcription_buffer = ""
+                self._final_transcription_buffer = []
 
         try:
             async for message in self._websocket:
@@ -280,7 +282,7 @@ class SonioxSTTService(STTService):
                         self._last_tokens_received = time.time()
 
                 # We will only send the final tokens after we get the "endpoint" event.
-                non_final_transcription = ""
+                non_final_transcription = []
 
                 for token in tokens:
                     if token["is_final"]:
@@ -289,18 +291,26 @@ class SonioxSTTService(STTService):
                             # the rest will be sent as interim tokens (even final tokens).
                             await send_endpoint_transcript()
                         else:
-                            self._final_transcription_buffer += token["text"]
+                            self._final_transcription_buffer.append(token)
                     else:
-                        non_final_transcription += token["text"]
+                        non_final_transcription.append(token)
 
                 if self._final_transcription_buffer or non_final_transcription:
+                    final_text = "".join(
+                        map(lambda token: token["text"], self._final_transcription_buffer)
+                    )
+                    non_final_text = "".join(
+                        map(lambda token: token["text"], non_final_transcription)
+                    )
+
                     await self.push_frame(
                         InterimTranscriptionFrame(
                             # Even final tokens are sent as interim tokens as we want to send
                             # nicely formatted messages - therefore waiting for the endpoint.
-                            self._final_transcription_buffer + non_final_transcription,
-                            self._user_id,
-                            time_now_iso8601(),
+                            text=final_text + non_final_text,
+                            user_id=self._user_id,
+                            timestamp=time_now_iso8601(),
+                            result=self._final_transcription_buffer + non_final_transcription,
                         )
                     )
 

From c093eb5b636ccf245ac1ae7318c726543059b97f Mon Sep 17 00:00:00 2001
From: Matej Marinko <matej.marinko@soniox.com>
Date: Wed, 9 Jul 2025 10:20:37 +0200
Subject: [PATCH 12/18] Move config to main file

---
 .../foundational/07za-interruptible-soniox.py |  2 --
 .../foundational/13i-soniox-transcription.py  |  4 ---
 src/pipecat/services/soniox/config.py         | 36 -------------------
 src/pipecat/services/soniox/stt.py            | 27 +++++++++++++-
 4 files changed, 26 insertions(+), 43 deletions(-)
 delete mode 100644 src/pipecat/services/soniox/config.py

diff --git a/examples/foundational/07za-interruptible-soniox.py b/examples/foundational/07za-interruptible-soniox.py
index d909d5e1d..a879821a2 100644
--- a/examples/foundational/07za-interruptible-soniox.py
+++ b/examples/foundational/07za-interruptible-soniox.py
@@ -17,9 +17,7 @@ from pipecat.pipeline.task import PipelineParams, PipelineTask
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
 from pipecat.services.cartesia.tts import CartesiaTTSService
 from pipecat.services.openai.llm import OpenAILLMService
-from pipecat.services.soniox.config import SonioxInputParams
 from pipecat.services.soniox.stt import SonioxSTTService
-from pipecat.transcriptions.language import Language
 from pipecat.transports.base_transport import BaseTransport, TransportParams
 from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
 from pipecat.transports.services.daily import DailyParams
diff --git a/examples/foundational/13i-soniox-transcription.py b/examples/foundational/13i-soniox-transcription.py
index 12760846c..5bec62f28 100644
--- a/examples/foundational/13i-soniox-transcription.py
+++ b/examples/foundational/13i-soniox-transcription.py
@@ -16,13 +16,9 @@ from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineTask
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
-from pipecat.services.soniox.config import SonioxInputParams
 from pipecat.services.soniox.stt import SonioxSTTService
-from pipecat.transcriptions.language import Language
 from pipecat.transports.base_transport import BaseTransport, TransportParams
 from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
-from pipecat.transports.network.small_webrtc import SmallWebRTCTransport
-from pipecat.transports.network.webrtc_connection import SmallWebRTCConnection
 from pipecat.transports.services.daily import DailyParams
 
 load_dotenv(override=True)
diff --git a/src/pipecat/services/soniox/config.py b/src/pipecat/services/soniox/config.py
deleted file mode 100644
index 2980a9e81..000000000
--- a/src/pipecat/services/soniox/config.py
+++ /dev/null
@@ -1,36 +0,0 @@
-#
-# Copyright (c) 2024–2025, Daily
-#
-# SPDX-License-Identifier: BSD 2-Clause License
-#
-
-from typing import List, Optional
-
-from pydantic import BaseModel
-
-from pipecat.transcriptions.language import Language
-
-
-class SonioxInputParams(BaseModel):
-    """Real-time transcription settings.
-
-    Parameters:
-        languages: List of language codes to use for transcription
-        code_switching: Whether to auto-detect language changes during transcription
-    """
-
-    model: str = "stt-rt-preview"
-
-    audio_format: Optional[str] = "pcm_s16le"
-    num_channels: Optional[int] = 1
-    sample_rate: Optional[int] = 16000
-
-    language_hints: Optional[List[Language]] = None
-    context: Optional[str] = None
-
-    enable_non_final_tokens: Optional[bool] = True
-    max_non_final_tokens_duration_ms: Optional[int] = None
-
-    enable_endpoint_detection: Optional[bool] = True
-
-    client_reference_id: Optional[str] = None
diff --git a/src/pipecat/services/soniox/stt.py b/src/pipecat/services/soniox/stt.py
index b5a582286..9a85eaa6c 100644
--- a/src/pipecat/services/soniox/stt.py
+++ b/src/pipecat/services/soniox/stt.py
@@ -10,6 +10,7 @@ import time
 from typing import AsyncGenerator, List, Optional
 
 from loguru import logger
+from pydantic import BaseModel
 
 from pipecat.frames.frames import (
     CancelFrame,
@@ -22,7 +23,6 @@ from pipecat.frames.frames import (
     UserStoppedSpeakingFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection
-from pipecat.services.soniox.config import SonioxInputParams
 from pipecat.services.stt_service import STTService
 from pipecat.transcriptions.language import Language
 from pipecat.utils.time import time_now_iso8601
@@ -45,6 +45,31 @@ END_TOKEN = "<end>"
 FINALIZED_TOKEN = "<fin>"
 
 
+class SonioxInputParams(BaseModel):
+    """Real-time transcription settings.
+
+    Parameters:
+        languages: List of language codes to use for transcription
+        code_switching: Whether to auto-detect language changes during transcription
+    """
+
+    model: str = "stt-rt-preview"
+
+    audio_format: Optional[str] = "pcm_s16le"
+    num_channels: Optional[int] = 1
+    sample_rate: Optional[int] = 16000
+
+    language_hints: Optional[List[Language]] = None
+    context: Optional[str] = None
+
+    enable_non_final_tokens: Optional[bool] = True
+    max_non_final_tokens_duration_ms: Optional[int] = None
+
+    enable_endpoint_detection: Optional[bool] = True
+
+    client_reference_id: Optional[str] = None
+
+
 def is_end_token(token: dict) -> bool:
     """Determine if a token is an end token."""
     return token["text"] == END_TOKEN or token["text"] == FINALIZED_TOKEN

From 61ac77be729a005dfc166a34d46131f4731bd038 Mon Sep 17 00:00:00 2001
From: Matej Marinko <matej.marinko@soniox.com>
Date: Wed, 9 Jul 2025 11:59:45 +0200
Subject: [PATCH 13/18] Update docs

---
 src/pipecat/services/soniox/stt.py | 41 +++++++++++++++++++++++++-----
 1 file changed, 35 insertions(+), 6 deletions(-)

diff --git a/src/pipecat/services/soniox/stt.py b/src/pipecat/services/soniox/stt.py
index 9a85eaa6c..adcc63601 100644
--- a/src/pipecat/services/soniox/stt.py
+++ b/src/pipecat/services/soniox/stt.py
@@ -49,8 +49,15 @@ class SonioxInputParams(BaseModel):
     """Real-time transcription settings.
 
     Parameters:
-        languages: List of language codes to use for transcription
-        code_switching: Whether to auto-detect language changes during transcription
+        model: Model to use for transcription.
+        audio_format: Audio format to use for transcription.
+        num_channels: Number of channels to use for transcription.
+        language_hints: List of language hints to use for transcription.
+        context: Customization for transcription.
+        enable_non_final_tokens: Whether to enable non-final tokens. If false, only final tokens will be returned.
+        max_non_final_tokens_duration_ms: Maximum duration of non-final tokens.
+        enable_endpoint_detection: Whether to enable endpoint detection. User will receive interim transcription until the endpoint is detected.
+        client_reference_id: Client reference ID to use for transcription.
     """
 
     model: str = "stt-rt-preview"
@@ -123,7 +130,6 @@ class SonioxSTTService(STTService):
         Args:
             api_key: Soniox API key.
             url: Soniox WebSocket API URL.
-            model: Transcription model to use.
             sample_rate: Audio sample rate. If None, uses value from `params`.
             params: Additional configuration parameters, such as language hints, context and
                 speaker diarization.
@@ -153,7 +159,11 @@ class SonioxSTTService(STTService):
         self._keepalive_task = None
 
     async def start(self, frame: StartFrame):
-        """Start the Soniox STT websocket connection."""
+        """Start the Soniox STT websocket connection.
+
+        Args:
+            frame: The start frame containing initialization parameters.
+        """
         await super().start(frame)
         if self._websocket:
             return
@@ -206,6 +216,9 @@ class SonioxSTTService(STTService):
 
         Stopping waits for the server to close the connection as we might receive
         additional final tokens after sending the stop recording message.
+
+        Args:
+            frame: The end frame.
         """
         await super().stop(frame)
         await self._send_stop_recording()
@@ -216,12 +229,22 @@ class SonioxSTTService(STTService):
         Compared to stop, this method closes the connection immediately without waiting
         for the server to close it. This is useful when we want to stop the connection
         immediately without waiting for the server to send any final tokens.
+
+        Args:
+            frame: The cancel frame.
         """
         await super().cancel(frame)
         await self._cleanup()
 
     async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
-        """Send audio data to Soniox STT Service."""
+        """Send audio data to Soniox STT Service.
+
+        Args:
+            audio: Raw audio bytes to transcribe.
+
+        Yields:
+            Frame: None (transcription results come via WebSocket callbacks).
+        """
         await self.start_processing_metrics()
         if self._websocket and not self._websocket.closed:
             await self._websocket.send(audio)
@@ -237,7 +260,12 @@ class SonioxSTTService(STTService):
         pass
 
     async def process_frame(self, frame: Frame, direction: FrameDirection):
-        """Processes a frame of audio data, either buffering or transcribing it."""
+        """Processes a frame of audio data, either buffering or transcribing it.
+
+        Args:
+            frame: The frame to process.
+            direction: The direction of frame processing.
+        """
         await super().process_frame(frame, direction)
 
         if isinstance(frame, UserStoppedSpeakingFrame) and self._enable_vad:
@@ -247,6 +275,7 @@ class SonioxSTTService(STTService):
                 logger.debug(f"Triggered finalize event on: {frame.name=}, {direction=}")
 
     async def _send_stop_recording(self):
+        """Send stop recording message to Soniox."""
         if self._websocket and not self._websocket.closed:
             # Send stop recording message
             await self._websocket.send("")

From 650d45c1f4dbbc318e7a9d40dbb516684d4153b1 Mon Sep 17 00:00:00 2001
From: Matej Marinko <matej.marinko@soniox.com>
Date: Fri, 11 Jul 2025 08:27:06 +0200
Subject: [PATCH 14/18] Use single sample rate parameter

---
 src/pipecat/services/soniox/stt.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/pipecat/services/soniox/stt.py b/src/pipecat/services/soniox/stt.py
index adcc63601..5698139bf 100644
--- a/src/pipecat/services/soniox/stt.py
+++ b/src/pipecat/services/soniox/stt.py
@@ -48,6 +48,9 @@ FINALIZED_TOKEN = "<fin>"
 class SonioxInputParams(BaseModel):
     """Real-time transcription settings.
 
+    See Soniox WebSocket API documentation for more details:
+    https://soniox.com/docs/speech-to-text/api-reference/websocket-api#configuration-parameters
+
     Parameters:
         model: Model to use for transcription.
         audio_format: Audio format to use for transcription.
@@ -64,7 +67,6 @@ class SonioxInputParams(BaseModel):
 
     audio_format: Optional[str] = "pcm_s16le"
     num_channels: Optional[int] = 1
-    sample_rate: Optional[int] = 16000
 
     language_hints: Optional[List[Language]] = None
     context: Optional[str] = None
@@ -130,7 +132,7 @@ class SonioxSTTService(STTService):
         Args:
             api_key: Soniox API key.
             url: Soniox WebSocket API URL.
-            sample_rate: Audio sample rate. If None, uses value from `params`.
+            sample_rate: Audio sample rate.
             params: Additional configuration parameters, such as language hints, context and
                 speaker diarization.
             enable_vad: Listen to `UserStoppedSpeakingFrame` to send finalize message to Soniox.
@@ -140,7 +142,6 @@ class SonioxSTTService(STTService):
                 to `None`, the auto finalize feature is disabled.
             **kwargs: Additional arguments passed to the STTService.
         """
-        sample_rate = sample_rate or (params.sample_rate if params and params.sample_rate else None)
         super().__init__(sample_rate=sample_rate, **kwargs)
         params = params or SonioxInputParams()
 
@@ -180,7 +181,7 @@ class SonioxSTTService(STTService):
             "audio_format": self._params.audio_format,
             "num_channels": self._params.num_channels or 1,
             "enable_endpoint_detection": self._params.enable_endpoint_detection,
-            "sample_rate": self._sample_rate,
+            "sample_rate": self.sample_rate,
             "language_hints": _prepare_language_hints(self._params.language_hints),
             "context": self._params.context,
             "enable_non_final_tokens": self._params.enable_non_final_tokens,

From 2e84c91748e50b58b5d2939952a89a77912ba955 Mon Sep 17 00:00:00 2001
From: Matej Marinko <matej.marinko@soniox.com>
Date: Fri, 11 Jul 2025 08:52:39 +0200
Subject: [PATCH 15/18] Remove outdated parameter

---
 src/pipecat/services/soniox/stt.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/pipecat/services/soniox/stt.py b/src/pipecat/services/soniox/stt.py
index 5698139bf..ddf125256 100644
--- a/src/pipecat/services/soniox/stt.py
+++ b/src/pipecat/services/soniox/stt.py
@@ -124,7 +124,6 @@ class SonioxSTTService(STTService):
         sample_rate: Optional[int] = None,
         params: Optional[SonioxInputParams] = None,
         enable_vad: bool = False,
-        auto_finalize_delay_ms: Optional[int] = 3000,
         **kwargs,
     ):
         """Initialize the Soniox STT service.
@@ -136,10 +135,6 @@ class SonioxSTTService(STTService):
             params: Additional configuration parameters, such as language hints, context and
                 speaker diarization.
             enable_vad: Listen to `UserStoppedSpeakingFrame` to send finalize message to Soniox.
-            auto_finalize_delay_ms: If no new tokens are received for a while and there is active
-                transcription (only InterimTranscriptionFrame), finalize the transcription by
-                sending the finalize message so user can receive the final transcript. If set
-                to `None`, the auto finalize feature is disabled.
             **kwargs: Additional arguments passed to the STTService.
         """
         super().__init__(sample_rate=sample_rate, **kwargs)
@@ -150,7 +145,6 @@ class SonioxSTTService(STTService):
         self.set_model_name(params.model)
         self._params = params
         self._enable_vad = enable_vad
-        self._auto_finalize_delay_ms = auto_finalize_delay_ms
         self._websocket = None
 
         self._final_transcription_buffer = []

From 5c3fb73cef8ff895fca29410615ad1c223f5f422 Mon Sep 17 00:00:00 2001
From: Matej Marinko <matej.marinko@soniox.com>
Date: Fri, 11 Jul 2025 16:07:24 +0200
Subject: [PATCH 16/18] Rename example

---
 ...{07za-interruptible-soniox.py => 07aa-interruptible-soniox.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename examples/foundational/{07za-interruptible-soniox.py => 07aa-interruptible-soniox.py} (100%)

diff --git a/examples/foundational/07za-interruptible-soniox.py b/examples/foundational/07aa-interruptible-soniox.py
similarity index 100%
rename from examples/foundational/07za-interruptible-soniox.py
rename to examples/foundational/07aa-interruptible-soniox.py

From c969fdddb9689af01d23b8602612885a76ea427c Mon Sep 17 00:00:00 2001
From: Matej Marinko <matej.marinko@soniox.com>
Date: Wed, 16 Jul 2025 09:47:34 +0200
Subject: [PATCH 17/18] Rename and simplify VAD finalization parameter usage

---
 src/pipecat/services/soniox/stt.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/pipecat/services/soniox/stt.py b/src/pipecat/services/soniox/stt.py
index ddf125256..cdc73a938 100644
--- a/src/pipecat/services/soniox/stt.py
+++ b/src/pipecat/services/soniox/stt.py
@@ -59,7 +59,6 @@ class SonioxInputParams(BaseModel):
         context: Customization for transcription.
         enable_non_final_tokens: Whether to enable non-final tokens. If false, only final tokens will be returned.
         max_non_final_tokens_duration_ms: Maximum duration of non-final tokens.
-        enable_endpoint_detection: Whether to enable endpoint detection. User will receive interim transcription until the endpoint is detected.
         client_reference_id: Client reference ID to use for transcription.
     """
 
@@ -74,8 +73,6 @@ class SonioxInputParams(BaseModel):
     enable_non_final_tokens: Optional[bool] = True
     max_non_final_tokens_duration_ms: Optional[int] = None
 
-    enable_endpoint_detection: Optional[bool] = True
-
     client_reference_id: Optional[str] = None
 
 
@@ -123,7 +120,7 @@ class SonioxSTTService(STTService):
         url: str = "wss://stt-rt.soniox.com/transcribe-websocket",
         sample_rate: Optional[int] = None,
         params: Optional[SonioxInputParams] = None,
-        enable_vad: bool = False,
+        vad_force_turn_endpoint: bool = False,
         **kwargs,
     ):
         """Initialize the Soniox STT service.
@@ -134,7 +131,7 @@ class SonioxSTTService(STTService):
             sample_rate: Audio sample rate.
             params: Additional configuration parameters, such as language hints, context and
                 speaker diarization.
-            enable_vad: Listen to `UserStoppedSpeakingFrame` to send finalize message to Soniox.
+            vad_force_turn_endpoint: If True, send a finalize message to Soniox on every `UserStoppedSpeakingFrame` and disable Soniox endpoint detection. If False, Soniox endpoint detection decides when the speech has ended.
             **kwargs: Additional arguments passed to the STTService.
         """
         super().__init__(sample_rate=sample_rate, **kwargs)
@@ -144,7 +141,7 @@ class SonioxSTTService(STTService):
         self._url = url
         self.set_model_name(params.model)
         self._params = params
-        self._enable_vad = enable_vad
+        self._vad_force_turn_endpoint = vad_force_turn_endpoint
         self._websocket = None
 
         self._final_transcription_buffer = []
@@ -168,13 +165,17 @@ class SonioxSTTService(STTService):
         if not self._websocket:
             logger.error(f"Unable to connect to Soniox API at {self._url}")
 
+        # If vad_force_turn_endpoint is not enabled, we need to enable endpoint detection.
+        # Either one or the other is required.
+        enable_endpoint_detection = not self._vad_force_turn_endpoint
+
         # Send the initial configuration message.
         config = {
             "api_key": self._api_key,
             "model": self._model_name,
             "audio_format": self._params.audio_format,
             "num_channels": self._params.num_channels or 1,
-            "enable_endpoint_detection": self._params.enable_endpoint_detection,
+            "enable_endpoint_detection": enable_endpoint_detection,
             "sample_rate": self.sample_rate,
             "language_hints": _prepare_language_hints(self._params.language_hints),
             "context": self._params.context,
@@ -263,7 +264,7 @@ class SonioxSTTService(STTService):
         """
         await super().process_frame(frame, direction)
 
-        if isinstance(frame, UserStoppedSpeakingFrame) and self._enable_vad:
+        if isinstance(frame, UserStoppedSpeakingFrame) and self._vad_force_turn_endpoint:
             # Send finalize message to Soniox so we get the final tokens asap.
             if self._websocket and not self._websocket.closed:
                 await self._websocket.send(FINALIZE_MESSAGE)

From cb984237a7ef711a89d7de5fe7151822b12fbe70 Mon Sep 17 00:00:00 2001
From: Matej Marinko <matej.marinko@soniox.com>
Date: Wed, 16 Jul 2025 16:54:28 +0200
Subject: [PATCH 18/18] Fix lint error

---
 src/pipecat/services/soniox/stt.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/pipecat/services/soniox/stt.py b/src/pipecat/services/soniox/stt.py
index cdc73a938..f51653876 100644
--- a/src/pipecat/services/soniox/stt.py
+++ b/src/pipecat/services/soniox/stt.py
@@ -4,6 +4,8 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+"""Soniox speech-to-text service implementation."""
+
 import asyncio
 import json
 import time