From a7bf9f538cbfcd5616f779c7e33277fef76ac324 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Tue, 7 Apr 2026 12:56:10 -0400 Subject: [PATCH 1/5] Clean up comments in MistralTTSService --- src/pipecat/services/mistral/tts.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/pipecat/services/mistral/tts.py b/src/pipecat/services/mistral/tts.py index 7a9c7dea2..d00b98a95 100644 --- a/src/pipecat/services/mistral/tts.py +++ b/src/pipecat/services/mistral/tts.py @@ -71,12 +71,10 @@ class MistralTTSService(TTSService): """Initialize Mistral TTS service. Args: - api_key: Mistral API key for authentication. If None, uses - MISTRAL_API_KEY environment variable. + api_key: Mistral API key for authentication. sample_rate: Output audio sample rate in Hz. Audio is resampled from Mistral's native 24kHz when a different rate is requested. - settings: Runtime-updatable settings. When provided alongside deprecated - parameters, ``settings`` values take precedence. + settings: Runtime-updatable settings. **kwargs: Additional keyword arguments passed to TTSService. 
""" # Initialize default_settings with hardcoded defaults From 68a3070ad42ebdd7ea90eafe52175b9b07236302 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Tue, 7 Apr 2026 15:26:56 -0400 Subject: [PATCH 2/5] Add Mistral Voxtral Realtime STT service --- .../transcription/transcription-mistral.py | 93 ++++++ examples/voice/voice-mistral.py | 4 +- src/pipecat/services/mistral/stt.py | 315 ++++++++++++++++++ src/pipecat/services/stt_latency.py | 1 + 4 files changed, 411 insertions(+), 2 deletions(-) create mode 100644 examples/transcription/transcription-mistral.py create mode 100644 src/pipecat/services/mistral/stt.py diff --git a/examples/transcription/transcription-mistral.py b/examples/transcription/transcription-mistral.py new file mode 100644 index 000000000..b040b457c --- /dev/null +++ b/examples/transcription/transcription-mistral.py @@ -0,0 +1,93 @@ +# +# Copyright (c) 2024-2026, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +import os + +from dotenv import load_dotenv +from loguru import logger + +from pipecat.audio.vad.silero import SileroVADAnalyzer +from pipecat.frames.frames import Frame, TranscriptionFrame +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.processors.audio.vad_processor import VADProcessor +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor +from pipecat.runner.types import RunnerArguments +from pipecat.runner.utils import create_transport +from pipecat.services.mistral.stt import MistralSTTService +from pipecat.transports.base_transport import BaseTransport, TransportParams +from pipecat.transports.daily.transport import DailyParams +from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams + +load_dotenv(override=True) + + +class TranscriptionLogger(FrameProcessor): + async def process_frame(self, frame: Frame, direction: FrameDirection): + await 
super().process_frame(frame, direction) + + if isinstance(frame, TranscriptionFrame): + print(f"Transcription: {frame.text}") + + # Push all frames through + await self.push_frame(frame, direction) + + +transport_params = { + "daily": lambda: DailyParams( + audio_in_enabled=True, + ), + "twilio": lambda: FastAPIWebsocketParams( + audio_in_enabled=True, + ), + "webrtc": lambda: TransportParams( + audio_in_enabled=True, + ), +} + + +async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): + logger.info(f"Starting bot") + + stt = MistralSTTService( + api_key=os.getenv("MISTRAL_API_KEY"), + ) + + tl = TranscriptionLogger() + vad_processor = VADProcessor(vad_analyzer=SileroVADAnalyzer()) + + pipeline = Pipeline([transport.input(), vad_processor, stt, tl]) + + task = PipelineTask( + pipeline, + params=PipelineParams( + enable_metrics=True, + enable_usage_metrics=True, + ), + idle_timeout_secs=runner_args.pipeline_idle_timeout_secs, + ) + + @transport.event_handler("on_client_disconnected") + async def on_client_disconnected(transport, client): + logger.info(f"Client disconnected") + await task.cancel() + + runner = PipelineRunner(handle_sigint=runner_args.handle_sigint) + + await runner.run(task) + + +async def bot(runner_args: RunnerArguments): + """Main bot entry point compatible with Pipecat Cloud.""" + transport = await create_transport(runner_args, transport_params) + await run_bot(transport, runner_args) + + +if __name__ == "__main__": + from pipecat.runner.run import main + + main() diff --git a/examples/voice/voice-mistral.py b/examples/voice/voice-mistral.py index 9c8c8789b..440039b65 100644 --- a/examples/voice/voice-mistral.py +++ b/examples/voice/voice-mistral.py @@ -22,7 +22,7 @@ from pipecat.processors.aggregators.llm_response_universal import ( ) from pipecat.runner.types import RunnerArguments from pipecat.runner.utils import create_transport -from pipecat.services.deepgram.stt import DeepgramSTTService +from pipecat.services.mistral.stt 
import MistralSTTService from pipecat.services.mistral.tts import MistralTTSService from pipecat.services.openai.llm import OpenAILLMService from pipecat.transports.base_transport import BaseTransport, TransportParams @@ -53,7 +53,7 @@ transport_params = { async def run_bot(transport: BaseTransport, runner_args: RunnerArguments): logger.info(f"Starting bot") - stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY")) + stt = MistralSTTService(api_key=os.getenv("MISTRAL_API_KEY")) tts = MistralTTSService( api_key=os.getenv("MISTRAL_API_KEY"), diff --git a/src/pipecat/services/mistral/stt.py b/src/pipecat/services/mistral/stt.py new file mode 100644 index 000000000..c41768d15 --- /dev/null +++ b/src/pipecat/services/mistral/stt.py @@ -0,0 +1,315 @@ +# +# Copyright (c) 2024-2026, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +"""Mistral Speech-to-Text service implementation. + +This module provides a real-time STT service that integrates with Mistral's +Voxtral Realtime transcription API using the Mistral SDK's RealtimeConnection. 
+""" + +from dataclasses import dataclass +from typing import Any, AsyncGenerator, Optional + +from loguru import logger + +from pipecat.frames.frames import ( + CancelFrame, + EndFrame, + Frame, + InterimTranscriptionFrame, + StartFrame, + TranscriptionFrame, + VADUserStartedSpeakingFrame, + VADUserStoppedSpeakingFrame, +) +from pipecat.processors.frame_processor import FrameDirection +from pipecat.services.settings import STTSettings +from pipecat.services.stt_latency import MISTRAL_TTFS_P99 +from pipecat.services.stt_service import STTService +from pipecat.utils.time import time_now_iso8601 +from pipecat.utils.tracing.service_decorators import traced_stt + +try: + from mistralai.client import Mistral + from mistralai.client.models import ( + AudioFormat, + RealtimeTranscriptionError, + RealtimeTranscriptionSessionCreated, + TranscriptionStreamDone, + TranscriptionStreamLanguage, + TranscriptionStreamTextDelta, + ) + from mistralai.extra.realtime import RealtimeConnection, UnknownRealtimeEvent +except ModuleNotFoundError as e: + logger.error(f"Exception: {e}") + logger.error("In order to use Mistral STT, you need to `pip install pipecat-ai[mistral]`.") + raise Exception(f"Missing module: {e}") + + +@dataclass +class MistralSTTSettings(STTSettings): + """Settings for MistralSTTService. + + Parameters: + model: STT model identifier. + language: Language hint for transcription. + """ + + pass + + +class MistralSTTService(STTService): + """Mistral Speech-to-Text service using the Voxtral Realtime API. + + This service uses the Mistral SDK's RealtimeConnection to stream audio + and receive transcription events over WebSocket. It extends STTService + directly (rather than WebsocketSTTService) because the SDK manages + the WebSocket connection internally. + + Event handlers available: + + - on_connected: Called when a transcription session is created. + - on_disconnected: Called when the connection is closed. 
+ - on_connection_error: Called when a transcription error occurs. + + Example:: + + @stt.event_handler("on_connected") + async def on_connected(stt): + logger.info("Mistral STT connected") + """ + + Settings = MistralSTTSettings + _settings: Settings + + def __init__( + self, + *, + api_key: Optional[str] = None, + base_url: Optional[str] = None, + sample_rate: Optional[int] = None, + target_streaming_delay_ms: Optional[int] = None, + ttfs_p99_latency: Optional[float] = MISTRAL_TTFS_P99, + settings: Optional[Settings] = None, + **kwargs, + ): + """Initialize Mistral STT service. + + Args: + api_key: Mistral API key for authentication. + base_url: Custom API endpoint URL. + sample_rate: Audio sample rate in Hz. If None, uses the pipeline + sample rate. + target_streaming_delay_ms: Streaming delay for accuracy/latency + tradeoff. Higher values may improve accuracy at the cost of + latency. + ttfs_p99_latency: P99 latency from speech end to final transcript + in seconds. Override for your deployment. + settings: Runtime-updatable settings. + **kwargs: Additional keyword arguments passed to STTService. + """ + default_settings = self.Settings( + model="voxtral-mini-transcribe-realtime-2602", + language=None, + ) + + if settings is not None: + default_settings.apply_update(settings) + + super().__init__( + sample_rate=sample_rate, + ttfs_p99_latency=ttfs_p99_latency, + settings=default_settings, + **kwargs, + ) + + self._client = Mistral(api_key=api_key, server_url=base_url) + self._target_streaming_delay_ms = target_streaming_delay_ms + self._connection: Optional[RealtimeConnection] = None + self._receive_task = None + self._accumulated_text = "" + self._detected_language: Optional[str] = None + + def can_generate_metrics(self) -> bool: + """Check if the service can generate processing metrics. + + Returns: + True, indicating metrics are supported. + """ + return True + + async def start(self, frame: StartFrame): + """Start the STT service and establish connection. 
+
+        Args:
+            frame: Frame indicating service should start.
+        """
+        await super().start(frame)
+        await self._connect()
+
+    async def stop(self, frame: EndFrame):
+        """Stop the STT service and close connection.
+
+        Args:
+            frame: Frame indicating service should stop.
+        """
+        await super().stop(frame)
+        await self._disconnect()
+
+    async def cancel(self, frame: CancelFrame):
+        """Cancel the STT service and close connection.
+
+        Args:
+            frame: Frame indicating service should be cancelled.
+        """
+        await super().cancel(frame)
+        await self._disconnect()
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Process incoming frames and handle speech events.
+
+        Args:
+            frame: The frame to process.
+            direction: Direction of frame flow in the pipeline.
+        """
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, VADUserStartedSpeakingFrame):
+            self._accumulated_text = ""  # new utterance, discard previous interim text
+            await self._start_metrics()
+        elif isinstance(frame, VADUserStoppedSpeakingFrame):
+            if self._connection and not self._connection.is_closed:
+                await self._connection.flush_audio()
+
+    async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
+        """Send audio data to Mistral for transcription.
+
+        Args:
+            audio: Raw audio bytes to transcribe.
+
+        Yields:
+            None - transcription results arrive via the receive events task.
+        """
+        if not self._connection or self._connection.is_closed:
+            await self._connect()
+        if self._connection and not self._connection.is_closed:  # reconnect may have failed
+            await self._connection.send_audio(audio)
+        yield None
+
+    async def _start_metrics(self):
+        """Start performance metrics collection for transcription processing."""
+        await self.start_processing_metrics()
+
+    async def _connect(self):
+        """Open a realtime session and start the event reader; errors go to push_error."""
+        try:
+            logger.debug(f"{self}: Connecting to Mistral STT")
+
+            audio_format = AudioFormat(
+                encoding="pcm_s16le",
+                sample_rate=self.sample_rate,
+            )
+
+            self._connection = await self._client.audio.realtime.connect(
+                model=self._settings.model,
+                audio_format=audio_format,
+                target_streaming_delay_ms=self._target_streaming_delay_ms,
+            )
+
+            self._receive_task = self.create_task(
+                self._receive_events(), name="mistral_stt_receive"
+            )
+        except Exception as e:
+            await self.push_error(error_msg=f"Error connecting to Mistral STT: {e}", exception=e)
+
+    async def _disconnect(self):
+        """Cancel the receive task, close the connection and fire ``on_disconnected``."""
+        # Detach first: cancelling the receive task resets self._connection in its finally.
+        connection, self._connection = self._connection, None
+        if self._receive_task:
+            await self.cancel_task(self._receive_task)
+            self._receive_task = None
+        if connection and not connection.is_closed:
+            try:
+                logger.debug(f"{self}: Disconnecting from Mistral STT")
+                await connection.close()
+            except Exception as e:
+                logger.warning(f"{self}: Error closing connection: {e}")
+            finally:
+                await self._call_event_handler("on_disconnected")
+
+    async def _receive_events(self):
+        """Background task: iterate connection events and handle them."""
+        try:
+            async for event in self._connection.events():
+                if isinstance(event, RealtimeTranscriptionSessionCreated):
+                    logger.debug(f"{self}: Session created: {event.session}")
+                    await self._call_event_handler("on_connected")
+
+                elif isinstance(event, TranscriptionStreamTextDelta):
+                    self._accumulated_text += event.text
+                    await self.push_frame(
+                        InterimTranscriptionFrame(
+                            self._accumulated_text,
+                            self._user_id,
time_now_iso8601(), + ) + ) + + elif isinstance(event, TranscriptionStreamDone): + if event.text: + await self.push_frame( + TranscriptionFrame( + event.text, + self._user_id, + time_now_iso8601(), + language=self._detected_language, + ) + ) + await self._handle_transcription(event.text, True, self._detected_language) + await self.stop_processing_metrics() + self._accumulated_text = "" + + elif isinstance(event, TranscriptionStreamLanguage): + self._detected_language = event.audio_language + + elif isinstance(event, RealtimeTranscriptionError): + error_msg = event.error.message if event.error else "Unknown error" + await self.push_error(error_msg=f"Mistral STT error: {error_msg}") + await self._call_event_handler("on_connection_error", error_msg) + + elif isinstance(event, UnknownRealtimeEvent): + logger.warning(f"{self}: Unknown realtime event: {event}") + + except Exception as e: + await self.push_error(error_msg=f"Mistral STT receive error: {e}", exception=e) + await self._call_event_handler("on_connection_error", str(e)) + finally: + self._connection = None + + @traced_stt + async def _handle_transcription( + self, transcript: str, is_final: bool, language: Optional[str] = None + ): + """Handle a transcription result with tracing.""" + pass + + async def _update_settings(self, delta: STTSettings) -> dict[str, Any]: + """Apply a settings delta, reconnecting if model or language changes. + + Args: + delta: An STT settings delta. + + Returns: + Dict mapping changed field names to their previous values. 
+ """ + changed = await super()._update_settings(delta) + + if changed: + await self._disconnect() + await self._connect() + + return changed diff --git a/src/pipecat/services/stt_latency.py b/src/pipecat/services/stt_latency.py index 403902379..5ffd798de 100644 --- a/src/pipecat/services/stt_latency.py +++ b/src/pipecat/services/stt_latency.py @@ -55,4 +55,5 @@ NVIDIA_TTFS_P99: float = DEFAULT_TTFS_P99 WHISPER_TTFS_P99: float = DEFAULT_TTFS_P99 # No benchmark available yet; using conservative default +MISTRAL_TTFS_P99: float = DEFAULT_TTFS_P99 SMALLEST_TTFS_P99: float = DEFAULT_TTFS_P99 From 9131fa5c12ced2937d96ba88f9f7d8d521ccea30 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Tue, 7 Apr 2026 15:32:38 -0400 Subject: [PATCH 3/5] Add changelog for PR #4253 --- changelog/4253.added.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog/4253.added.md diff --git a/changelog/4253.added.md b/changelog/4253.added.md new file mode 100644 index 000000000..c9a9f91de --- /dev/null +++ b/changelog/4253.added.md @@ -0,0 +1 @@ +- Added `MistralSTTService` for real-time speech-to-text using Mistral's Voxtral Realtime API (`voxtral-mini-transcribe-realtime-2602`). Supports streaming transcription with interim results, automatic language detection, and VAD-driven utterance lifecycle. 
From 874e2878bec5cdc04f5be60bd777b85af05d0d75 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Tue, 7 Apr 2026 15:36:22 -0400 Subject: [PATCH 4/5] Update README with Mistral services --- README.md | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 5e4217567..5b652b328 100644 --- a/README.md +++ b/README.md @@ -85,20 +85,20 @@ Catch new features, interviews, and how-tos on our [Pipecat TV](https://www.yout ## 🧩 Available services -| Category | Services | -| ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), [Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Gradium](https://docs.pipecat.ai/server/services/stt/gradium), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [Sarvam](https://docs.pipecat.ai/server/services/stt/sarvam), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) | -| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), 
[Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [Nebius](https://docs.pipecat.ai/server/services/llm/nebius), [Novita](https://docs.pipecat.ai/server/services/llm/novita), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nvidia), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), [OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova), [Sarvam](https://docs.pipecat.ai/server/services/llm/sarvam), [Together AI](https://docs.pipecat.ai/server/services/llm/together) | -| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Camb AI](https://docs.pipecat.ai/server/services/tts/camb), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Gradium](https://docs.pipecat.ai/server/services/tts/gradium), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [Kokoro](https://docs.pipecat.ai/server/services/tts/kokoro), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), 
[Piper](https://docs.pipecat.ai/server/services/tts/piper), [Resemble](https://docs.pipecat.ai/server/services/tts/resemble), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [Smallest](https://docs.pipecat.ai/server/services/tts/smallest), [Speechmatics](https://docs.pipecat.ai/server/services/tts/speechmatics), [xAI](https://docs.pipecat.ai/server/services/tts/xai), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) | -| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [Grok Voice Agent](https://docs.pipecat.ai/server/services/s2s/grok), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai), [Ultravox](https://docs.pipecat.ai/server/services/s2s/ultravox), | -| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [LiveKit (WebRTC)](https://docs.pipecat.ai/server/services/transport/livekit), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), [WhatsApp](https://docs.pipecat.ai/server/services/transport/whatsapp), Local | -| Serializers | [Exotel](https://docs.pipecat.ai/server/services/serializers/exotel), [Genesys](https://docs.pipecat.ai/server/services/serializers/genesys), [Plivo](https://docs.pipecat.ai/server/services/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/services/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/services/serializers/telnyx), [Vonage](https://docs.pipecat.ai/server/services/serializers/vonage) | -| Video | [HeyGen](https://docs.pipecat.ai/server/services/video/heygen), [LemonSlice](https://docs.pipecat.ai/server/services/transport/lemonslice), 
[Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) | -| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) | -| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google Imagen](https://docs.pipecat.ai/server/services/image-generation/google-imagen), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) | -| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp Viva](https://docs.pipecat.ai/guides/features/krisp-viva), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [ai-coustics](https://docs.pipecat.ai/server/utilities/audio/aic-filter), [RNNoise](https://docs.pipecat.ai/server/utilities/audio/rnnoise-filter) | -| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) | -| Community | [Browse community integrations →](https://docs.pipecat.ai/server/services/community-integrations) | +| Category | Services | +| ------------------- | 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| Speech-to-Text | [AssemblyAI](https://docs.pipecat.ai/server/services/stt/assemblyai), [AWS](https://docs.pipecat.ai/server/services/stt/aws), [Azure](https://docs.pipecat.ai/server/services/stt/azure), 
[Cartesia](https://docs.pipecat.ai/server/services/stt/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/stt/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/stt/elevenlabs), [Fal Wizper](https://docs.pipecat.ai/server/services/stt/fal), [Gladia](https://docs.pipecat.ai/server/services/stt/gladia), [Google](https://docs.pipecat.ai/server/services/stt/google), [Gradium](https://docs.pipecat.ai/server/services/stt/gradium), [Groq (Whisper)](https://docs.pipecat.ai/server/services/stt/groq), [Mistral](https://docs.pipecat.ai/server/services/stt/mistral), [NVIDIA Riva](https://docs.pipecat.ai/server/services/stt/riva), [OpenAI (Whisper)](https://docs.pipecat.ai/server/services/stt/openai), [Sarvam](https://docs.pipecat.ai/server/services/stt/sarvam), [Soniox](https://docs.pipecat.ai/server/services/stt/soniox), [Speechmatics](https://docs.pipecat.ai/server/services/stt/speechmatics), [Whisper](https://docs.pipecat.ai/server/services/stt/whisper) | +| LLMs | [Anthropic](https://docs.pipecat.ai/server/services/llm/anthropic), [AWS](https://docs.pipecat.ai/server/services/llm/aws), [Azure](https://docs.pipecat.ai/server/services/llm/azure), [Cerebras](https://docs.pipecat.ai/server/services/llm/cerebras), [DeepSeek](https://docs.pipecat.ai/server/services/llm/deepseek), [Fireworks AI](https://docs.pipecat.ai/server/services/llm/fireworks), [Gemini](https://docs.pipecat.ai/server/services/llm/gemini), [Grok](https://docs.pipecat.ai/server/services/llm/grok), [Groq](https://docs.pipecat.ai/server/services/llm/groq), [Mistral](https://docs.pipecat.ai/server/services/llm/mistral), [Nebius](https://docs.pipecat.ai/server/services/llm/nebius), [Novita](https://docs.pipecat.ai/server/services/llm/novita), [NVIDIA NIM](https://docs.pipecat.ai/server/services/llm/nvidia), [Ollama](https://docs.pipecat.ai/server/services/llm/ollama), [OpenAI](https://docs.pipecat.ai/server/services/llm/openai), 
[OpenRouter](https://docs.pipecat.ai/server/services/llm/openrouter), [Perplexity](https://docs.pipecat.ai/server/services/llm/perplexity), [Qwen](https://docs.pipecat.ai/server/services/llm/qwen), [SambaNova](https://docs.pipecat.ai/server/services/llm/sambanova), [Sarvam](https://docs.pipecat.ai/server/services/llm/sarvam), [Together AI](https://docs.pipecat.ai/server/services/llm/together) | +| Text-to-Speech | [Async](https://docs.pipecat.ai/server/services/tts/asyncai), [AWS](https://docs.pipecat.ai/server/services/tts/aws), [Azure](https://docs.pipecat.ai/server/services/tts/azure), [Camb AI](https://docs.pipecat.ai/server/services/tts/camb), [Cartesia](https://docs.pipecat.ai/server/services/tts/cartesia), [Deepgram](https://docs.pipecat.ai/server/services/tts/deepgram), [ElevenLabs](https://docs.pipecat.ai/server/services/tts/elevenlabs), [Fish](https://docs.pipecat.ai/server/services/tts/fish), [Google](https://docs.pipecat.ai/server/services/tts/google), [Gradium](https://docs.pipecat.ai/server/services/tts/gradium), [Groq](https://docs.pipecat.ai/server/services/tts/groq), [Hume](https://docs.pipecat.ai/server/services/tts/hume), [Inworld](https://docs.pipecat.ai/server/services/tts/inworld), [Kokoro](https://docs.pipecat.ai/server/services/tts/kokoro), [LMNT](https://docs.pipecat.ai/server/services/tts/lmnt), [MiniMax](https://docs.pipecat.ai/server/services/tts/minimax), [Mistral](https://docs.pipecat.ai/server/services/tts/mistral), [Neuphonic](https://docs.pipecat.ai/server/services/tts/neuphonic), [NVIDIA Riva](https://docs.pipecat.ai/server/services/tts/riva), [OpenAI](https://docs.pipecat.ai/server/services/tts/openai), [Piper](https://docs.pipecat.ai/server/services/tts/piper), [Resemble](https://docs.pipecat.ai/server/services/tts/resemble), [Rime](https://docs.pipecat.ai/server/services/tts/rime), [Sarvam](https://docs.pipecat.ai/server/services/tts/sarvam), [Smallest](https://docs.pipecat.ai/server/services/tts/smallest), 
[Speechmatics](https://docs.pipecat.ai/server/services/tts/speechmatics), [xAI](https://docs.pipecat.ai/server/services/tts/xai), [XTTS](https://docs.pipecat.ai/server/services/tts/xtts) | +| Speech-to-Speech | [AWS Nova Sonic](https://docs.pipecat.ai/server/services/s2s/aws), [Gemini Multimodal Live](https://docs.pipecat.ai/server/services/s2s/gemini), [Grok Voice Agent](https://docs.pipecat.ai/server/services/s2s/grok), [OpenAI Realtime](https://docs.pipecat.ai/server/services/s2s/openai), [Ultravox](https://docs.pipecat.ai/server/services/s2s/ultravox), | +| Transport | [Daily (WebRTC)](https://docs.pipecat.ai/server/services/transport/daily), [FastAPI Websocket](https://docs.pipecat.ai/server/services/transport/fastapi-websocket), [LiveKit (WebRTC)](https://docs.pipecat.ai/server/services/transport/livekit), [SmallWebRTCTransport](https://docs.pipecat.ai/server/services/transport/small-webrtc), [WebSocket Server](https://docs.pipecat.ai/server/services/transport/websocket-server), [WhatsApp](https://docs.pipecat.ai/server/services/transport/whatsapp), Local | +| Serializers | [Exotel](https://docs.pipecat.ai/server/services/serializers/exotel), [Genesys](https://docs.pipecat.ai/server/services/serializers/genesys), [Plivo](https://docs.pipecat.ai/server/services/serializers/plivo), [Twilio](https://docs.pipecat.ai/server/services/serializers/twilio), [Telnyx](https://docs.pipecat.ai/server/services/serializers/telnyx), [Vonage](https://docs.pipecat.ai/server/services/serializers/vonage) | +| Video | [HeyGen](https://docs.pipecat.ai/server/services/video/heygen), [LemonSlice](https://docs.pipecat.ai/server/services/transport/lemonslice), [Tavus](https://docs.pipecat.ai/server/services/video/tavus), [Simli](https://docs.pipecat.ai/server/services/video/simli) | +| Memory | [mem0](https://docs.pipecat.ai/server/services/memory/mem0) | +| Vision & Image | [fal](https://docs.pipecat.ai/server/services/image-generation/fal), [Google 
Imagen](https://docs.pipecat.ai/server/services/image-generation/google-imagen), [Moondream](https://docs.pipecat.ai/server/services/vision/moondream) | +| Audio Processing | [Silero VAD](https://docs.pipecat.ai/server/utilities/audio/silero-vad-analyzer), [Krisp Viva](https://docs.pipecat.ai/guides/features/krisp-viva), [Koala](https://docs.pipecat.ai/server/utilities/audio/koala-filter), [ai-coustics](https://docs.pipecat.ai/server/utilities/audio/aic-filter), [RNNoise](https://docs.pipecat.ai/server/utilities/audio/rnnoise-filter) | +| Analytics & Metrics | [OpenTelemetry](https://docs.pipecat.ai/server/utilities/opentelemetry), [Sentry](https://docs.pipecat.ai/server/services/analytics/sentry) | +| Community | [Browse community integrations →](https://docs.pipecat.ai/server/services/community-integrations) | 📚 [View full services documentation →](https://docs.pipecat.ai/server/services/supported-services) From 215b2dc7f3d6aab398e2dca44384493cdf642bad Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Tue, 7 Apr 2026 15:37:07 -0400 Subject: [PATCH 5/5] Add voice-mistral to evals --- scripts/evals/run-release-evals.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/evals/run-release-evals.py b/scripts/evals/run-release-evals.py index 4924828c7..3bf6a0dfd 100644 --- a/scripts/evals/run-release-evals.py +++ b/scripts/evals/run-release-evals.py @@ -147,6 +147,7 @@ TESTS_VOICE = [ ("voice/voice-kokoro.py", EVAL_SIMPLE_MATH), ("voice/voice-resemble.py", EVAL_SIMPLE_MATH), ("voice/voice-smallest.py", EVAL_SIMPLE_MATH), + ("voice/voice-mistral.py", EVAL_SIMPLE_MATH), ("voice/voice-openai-responses.py", EVAL_SIMPLE_MATH), ("voice/voice-openai-responses-http.py", EVAL_SIMPLE_MATH), # Needs a local XTTS docker instance running.