diff --git a/changelog/3697.changed.2.md b/changelog/3697.changed.2.md new file mode 100644 index 000000000..0307bd59a --- /dev/null +++ b/changelog/3697.changed.2.md @@ -0,0 +1 @@ +- Update `SonioxSTTService` to set `vad_force_turn_endpoint` to `True`. This setting disables the turn detection logic available natively in Soniox. Instead, Soniox relies on a local VAD to finalize the transcript. This configuration meaningfully reduces the time to final segment for Soniox. With this setting enabled, Soniox outputs a transcript in ~250ms (median). Pipecat enables smart-turn detection by default using the `LocalSmartTurnAnalyzerV3`. To use the native turn detection logic in Soniox, just set `vad_force_turn_endpoint` to `False`. \ No newline at end of file diff --git a/examples/foundational/07za-interruptible-soniox.py b/examples/foundational/07za-interruptible-soniox.py index b896a43b9..a60cfe992 100644 --- a/examples/foundational/07za-interruptible-soniox.py +++ b/examples/foundational/07za-interruptible-soniox.py @@ -53,7 +53,6 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): stt = SonioxSTTService( api_key=os.getenv("SONIOX_API_KEY"), - vad_force_turn_endpoint=True, params=SonioxInputParams( language_hints=[Language.EN], language_hints_strict=True, diff --git a/src/pipecat/services/soniox/stt.py b/src/pipecat/services/soniox/stt.py index e3e9e3e5c..fb5b78e4a 100644 --- a/src/pipecat/services/soniox/stt.py +++ b/src/pipecat/services/soniox/stt.py @@ -152,7 +152,7 @@ class SonioxSTTService(WebsocketSTTService): url: str = "wss://stt-rt.soniox.com/transcribe-websocket", sample_rate: Optional[int] = None, params: Optional[SonioxInputParams] = None, - vad_force_turn_endpoint: bool = False, + vad_force_turn_endpoint: bool = True, ttfs_p99_latency: Optional[float] = SONIOX_TTFS_P99, **kwargs, ): @@ -164,7 +164,8 @@ class SonioxSTTService(WebsocketSTTService): sample_rate: Audio sample rate. 
params: Additional configuration parameters, such as language hints, context and speaker diarization. - vad_force_turn_endpoint: Listen to `VADUserStoppedSpeakingFrame` to send finalize message to Soniox. If disabled, Soniox will detect the end of the speech. + vad_force_turn_endpoint: Listen to `VADUserStoppedSpeakingFrame` to send a finalize message to Soniox. + If disabled, Soniox will detect the end of the speech. Defaults to True. ttfs_p99_latency: P99 latency from speech end to final transcript in seconds. Override for your deployment. See https://github.com/pipecat-ai/stt-benchmark **kwargs: Additional arguments passed to the STTService.