From 7f3f23dcb9582fb8ff65b32ca4bc38c567bd5f29 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Tue, 7 Apr 2026 09:12:47 -0400 Subject: [PATCH] Add Mistral Voxtral streaming TTS service Integrate with Mistral's Voxtral TTS API (voxtral-mini-tts-2603) using HTTP streaming with Server-Sent Events. Converts base64-encoded float32 PCM chunks from the API to int16 for the Pipecat pipeline. --- pyproject.toml | 2 +- src/pipecat/services/mistral/tts.py | 183 ++++++++++++++++++++++++++++ uv.lock | 62 +++++++--- 3 files changed, 231 insertions(+), 16 deletions(-) create mode 100644 src/pipecat/services/mistral/tts.py diff --git a/pyproject.toml b/pyproject.toml index dd5a028f4..b2fef1d7b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -88,7 +88,7 @@ local = [ "pyaudio~=0.2.14" ] local-smart-turn = [ "coremltools>=8.0", "transformers>=4.48.0,<6", "torch>=2.5.0,<3", "torchaudio>=2.5.0,<3" ] mcp = [ "mcp[cli]>=1.11.0,<2" ] mem0 = [ "mem0ai>=1.0.8,<2" ] -mistral = [] +mistral = ["mistralai>=2.0.0,<3"] mlx-whisper = [ "mlx-whisper~=0.4.2" ] moondream = [ "accelerate~=1.10.0", "einops~=0.8.0", "pyvips[binary]~=3.0.0", "timm~=1.0.13", "transformers>=4.48.0,<6" ] nebius = [] diff --git a/src/pipecat/services/mistral/tts.py b/src/pipecat/services/mistral/tts.py new file mode 100644 index 000000000..f2a49517a --- /dev/null +++ b/src/pipecat/services/mistral/tts.py @@ -0,0 +1,183 @@ +# +# Copyright (c) 2024-2026, Daily +# +# SPDX-License-Identifier: BSD 2-Clause License +# + +"""Mistral text-to-speech service implementation. + +This module provides integration with Mistral's Voxtral TTS API for +generating speech from text input using HTTP streaming with Server-Sent Events. 
import base64
import struct
from dataclasses import dataclass
from typing import AsyncGenerator, Optional

from loguru import logger

from pipecat.frames.frames import (
    ErrorFrame,
    Frame,
    TTSAudioRawFrame,
)
from pipecat.services.settings import TTSSettings
from pipecat.services.tts_service import TTSService
from pipecat.utils.tracing.service_decorators import traced_tts

try:
    from mistralai.client import Mistral
except ModuleNotFoundError as e:
    logger.error(f"Exception: {e}")
    logger.error("In order to use Mistral TTS, you need to `pip install pipecat-ai[mistral]`.")
    raise Exception(f"Missing module: {e}")

# Size in bytes of one little-endian float32 PCM sample as sent by the API.
_FLOAT32_BYTES = 4


@dataclass
class MistralTTSSettings(TTSSettings):
    """Settings for MistralTTSService.

    No fields are added here; the fields set by ``MistralTTSService`` are the
    ones inherited from ``TTSSettings``.

    Parameters:
        model: TTS model identifier.
        voice: Voice identifier.
        language: Language for speech synthesis.
    """

    pass


class MistralTTSService(TTSService):
    """Mistral Text-to-Speech service using the Voxtral TTS API.

    This service uses Mistral's streaming TTS API to generate PCM-encoded audio
    at 24kHz. The API returns base64-encoded float32 PCM chunks via Server-Sent
    Events, which are converted to int16 for the Pipecat pipeline.
    """

    Settings = MistralTTSSettings
    _settings: Settings

    # Sample rate (Hz) of the audio produced by the Mistral API; used as the
    # source rate when resampling to the pipeline's output rate.
    MISTRAL_SAMPLE_RATE = 24000

    def __init__(
        self,
        *,
        api_key: Optional[str] = None,
        voice_id: Optional[str] = None,
        model: Optional[str] = None,
        sample_rate: Optional[int] = None,
        settings: Optional[Settings] = None,
        **kwargs,
    ):
        """Initialize Mistral TTS service.

        Args:
            api_key: Mistral API key for authentication. If None, uses
                MISTRAL_API_KEY environment variable.
            voice_id: Voice ID to use for synthesis.

                .. deprecated:: 0.0.105
                    Use ``settings=MistralTTSService.Settings(voice=...)`` instead.

            model: TTS model to use. Defaults to "voxtral-mini-tts-2603".

                .. deprecated:: 0.0.105
                    Use ``settings=MistralTTSService.Settings(model=...)`` instead.

            sample_rate: Output audio sample rate in Hz. Audio is resampled from
                Mistral's native 24kHz when a different rate is requested.
            settings: Runtime-updatable settings. When provided alongside deprecated
                parameters, ``settings`` values take precedence.
            **kwargs: Additional keyword arguments passed to TTSService.
        """
        # 1. Initialize default_settings with hardcoded defaults
        default_settings = self.Settings(
            model="voxtral-mini-tts-2603",
            voice=None,
            language=None,
        )

        # 2. Apply direct init arg overrides (deprecated)
        if voice_id is not None:
            self._warn_init_param_moved_to_settings("voice_id", "voice")
            default_settings.voice = voice_id
        if model is not None:
            self._warn_init_param_moved_to_settings("model", "model")
            default_settings.model = model

        # 3. Apply settings delta (canonical API, always wins)
        if settings is not None:
            default_settings.apply_update(settings)

        super().__init__(
            sample_rate=sample_rate,
            push_start_frame=True,
            push_stop_frames=True,
            settings=default_settings,
            **kwargs,
        )

        self._client = Mistral(api_key=api_key)

    def can_generate_metrics(self) -> bool:
        """Check if this service can generate processing metrics.

        Returns:
            True, as Mistral TTS service supports metrics generation.
        """
        return True

    @staticmethod
    def _float32_to_int16(data: bytes) -> bytes:
        """Convert float32 PCM audio data to int16 PCM.

        Each sample is scaled by 32767, truncated toward zero and clamped to
        the int16 range. The conversion never raises on malformed audio:
        trailing bytes that do not form a whole float32 sample are ignored,
        NaN samples become silence (0) and infinite samples saturate.

        Args:
            data: Raw bytes containing float32 LE PCM samples.

        Returns:
            Raw bytes containing int16 LE PCM samples (empty if ``data`` holds
            no complete sample).
        """
        n = len(data) // _FLOAT32_BYTES
        if n == 0:
            return b""
        # unpack_from tolerates a buffer longer than the format, so a trailing
        # partial sample is skipped instead of raising struct.error.
        floats = struct.unpack_from(f"<{n}f", data)
        # Clamp while still a float: int() raises on inf/NaN, and NaN (the only
        # value for which f != f) would otherwise slip through min()/max().
        samples = (
            0 if f != f else int(max(-32768.0, min(32767.0, f * 32767))) for f in floats
        )
        return struct.pack(f"<{n}h", *samples)

    @traced_tts
    async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]:
        """Generate speech from text using Mistral's TTS API.

        Streams ``speech.audio.delta`` events, converts each float32 chunk to
        int16, resamples it to the service's output sample rate and yields it
        as an audio frame. Any exception raised while streaming is logged and
        reported as an ``ErrorFrame`` instead of propagating.

        Args:
            text: The text to synthesize into speech.
            context_id: The context ID for tracking audio frames.

        Yields:
            Frame: Audio frames containing the synthesized speech data.
        """
        logger.debug(f"{self}: Generating TTS [{text}]")
        # Bytes of a float32 sample that was split across two events; carried
        # over so sample alignment is preserved for the rest of the stream.
        pending = b""
        try:
            await self.start_tts_usage_metrics(text)

            async with await self._client.audio.speech.complete_async(
                input=text,
                model=self._settings.model,
                voice_id=self._settings.voice,
                response_format="pcm",
                stream=True,
            ) as event_stream:
                async for event in event_stream:
                    if event.event == "speech.audio.delta":
                        pending += base64.b64decode(event.data.audio_data)
                        usable = len(pending) - len(pending) % _FLOAT32_BYTES
                        if not usable:
                            # Not even one whole sample yet; wait for more data.
                            continue
                        chunk, pending = pending[:usable], pending[usable:]
                        audio_int16 = self._float32_to_int16(chunk)
                        audio_data = await self._resampler.resample(
                            audio_int16, self.MISTRAL_SAMPLE_RATE, self.sample_rate
                        )
                        if not audio_data:
                            # Nothing to play; don't emit an empty frame or
                            # stop the TTFB clock on it.
                            continue
                        await self.stop_ttfb_metrics()
                        yield TTSAudioRawFrame(
                            audio_data, self.sample_rate, 1, context_id=context_id
                        )
                    elif event.event == "speech.audio.done":
                        usage = getattr(event.data, "usage", None)
                        if usage:
                            logger.debug(f"{self}: Usage info: {usage}")
        except Exception as e:
            logger.error(f"{self} error generating TTS: {e}")
            yield ErrorFrame(error=f"Error generating TTS: {e}")
"https://files.pythonhosted.org/packages/28/50/1a313fb700526b134c71eb8a225d8b83be0385dbb0204337b4379c698cef/jsonpath_python-1.1.5-py3-none-any.whl", hash = "sha256:a60315404d70a65e76c9a782c84e50600480221d94a58af47b7b4d437351cb4b", size = 14090, upload-time = "2026-03-17T06:16:39.152Z" }, +] + [[package]] name = "jsonpointer" version = "3.1.1" @@ -3220,6 +3229,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/eb/e6/2c6ea68c404757e683da23b942dfff6987fe283ccbf2fa1fb0c128ddbdc6/mem0ai-1.0.10-py3-none-any.whl", hash = "sha256:9ff586c3a39a834042ce6755fc9da2315e284fb622ee773cd344ecca756ccad5", size = 295374, upload-time = "2026-04-01T18:23:25.022Z" }, ] +[[package]] +name = "mistralai" +version = "2.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "eval-type-backport" }, + { name = "httpx" }, + { name = "jsonpath-python" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "pydantic" }, + { name = "python-dateutil" }, + { name = "typing-inspection" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/4d/05/40c38c8893f0ec858756b30f4a939378fc62cf33565af538a843497f3f24/mistralai-2.3.0.tar.gz", hash = "sha256:eb371a9b3b62552f3d4a274ecf5b2c48b90fd3439ecd1425e7f5163cdd87e29a", size = 387145, upload-time = "2026-04-03T15:06:48.927Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bd/57/d06cbfd96ec6dc45d5c1fe9456f7fcfcb9549c9fa91e213561d1d88729e7/mistralai-2.3.0-py3-none-any.whl", hash = "sha256:22111747c215f1632141660151924f06579f87cd8db2649e0b1f87721d076851", size = 925544, upload-time = "2026-04-03T15:06:47.593Z" }, +] + [[package]] name = "mlx" version = "0.31.1" @@ -3806,20 +3834,20 @@ wheels = [ [[package]] name = "opentelemetry-api" -version = "1.40.0" +version = "1.39.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "importlib-metadata" }, { name = "typing-extensions" }, ] -sdist = { url = 
"https://files.pythonhosted.org/packages/2c/1d/4049a9e8698361cc1a1aa03a6c59e4fa4c71e0c0f94a30f988a6876a2ae6/opentelemetry_api-1.40.0.tar.gz", hash = "sha256:159be641c0b04d11e9ecd576906462773eb97ae1b657730f0ecf64d32071569f", size = 70851, upload-time = "2026-03-04T14:17:21.555Z" } +sdist = { url = "https://files.pythonhosted.org/packages/97/b9/3161be15bb8e3ad01be8be5a968a9237c3027c5be504362ff800fca3e442/opentelemetry_api-1.39.1.tar.gz", hash = "sha256:fbde8c80e1b937a2c61f20347e91c0c18a1940cecf012d62e65a7caf08967c9c", size = 65767, upload-time = "2025-12-11T13:32:39.182Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/5f/bf/93795954016c522008da367da292adceed71cca6ee1717e1d64c83089099/opentelemetry_api-1.40.0-py3-none-any.whl", hash = "sha256:82dd69331ae74b06f6a874704be0cfaa49a1650e1537d4a813b86ecef7d0ecf9", size = 68676, upload-time = "2026-03-04T14:17:01.24Z" }, + { url = "https://files.pythonhosted.org/packages/cf/df/d3f1ddf4bb4cb50ed9b1139cc7b1c54c34a1e7ce8fd1b9a37c0d1551a6bd/opentelemetry_api-1.39.1-py3-none-any.whl", hash = "sha256:2edd8463432a7f8443edce90972169b195e7d6a05500cd29e6d13898187c9950", size = 66356, upload-time = "2025-12-11T13:32:17.304Z" }, ] [[package]] name = "opentelemetry-instrumentation" -version = "0.61b0" +version = "0.60b1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "opentelemetry-api" }, @@ -3827,50 +3855,50 @@ dependencies = [ { name = "packaging" }, { name = "wrapt" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/da/37/6bf8e66bfcee5d3c6515b79cb2ee9ad05fe573c20f7ceb288d0e7eeec28c/opentelemetry_instrumentation-0.61b0.tar.gz", hash = "sha256:cb21b48db738c9de196eba6b805b4ff9de3b7f187e4bbf9a466fa170514f1fc7", size = 32606, upload-time = "2026-03-04T14:20:16.825Z" } +sdist = { url = "https://files.pythonhosted.org/packages/41/0f/7e6b713ac117c1f5e4e3300748af699b9902a2e5e34c9cf443dde25a01fa/opentelemetry_instrumentation-0.60b1.tar.gz", hash = 
"sha256:57ddc7974c6eb35865af0426d1a17132b88b2ed8586897fee187fd5b8944bd6a", size = 31706, upload-time = "2025-12-11T13:36:42.515Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/d8/3e/f6f10f178b6316de67f0dfdbbb699a24fbe8917cf1743c1595fb9dcdd461/opentelemetry_instrumentation-0.61b0-py3-none-any.whl", hash = "sha256:92a93a280e69788e8f88391247cc530fd81f16f2b011979d4d6398f805cfbc63", size = 33448, upload-time = "2026-03-04T14:19:02.447Z" }, + { url = "https://files.pythonhosted.org/packages/77/d2/6788e83c5c86a2690101681aeef27eeb2a6bf22df52d3f263a22cee20915/opentelemetry_instrumentation-0.60b1-py3-none-any.whl", hash = "sha256:04480db952b48fb1ed0073f822f0ee26012b7be7c3eac1a3793122737c78632d", size = 33096, upload-time = "2025-12-11T13:35:33.067Z" }, ] [[package]] name = "opentelemetry-instrumentation-threading" -version = "0.61b0" +version = "0.60b1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "opentelemetry-api" }, { name = "opentelemetry-instrumentation" }, { name = "wrapt" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/12/8f/8dedba66100cda58af057926449a5e58e6c008bec02bc2746c03c3d85dcd/opentelemetry_instrumentation_threading-0.61b0.tar.gz", hash = "sha256:38e0263c692d15a7a458b3fa0286d29290448fa4ac4c63045edac438c6113433", size = 9163, upload-time = "2026-03-04T14:20:50.546Z" } +sdist = { url = "https://files.pythonhosted.org/packages/9b/0a/e36123ec4c0910a3936b92982545a53e9bca5b26a28df06883751a783f84/opentelemetry_instrumentation_threading-0.60b1.tar.gz", hash = "sha256:20b18a68abe5801fa9474336b7c27487d4af3e00b66f6a8734e4fdd75c8b0b43", size = 8768, upload-time = "2025-12-11T13:37:16.29Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/e8/77/c06d960aede1a014812aa4fafde0ae546d790f46416fbeafa2b32095aae3/opentelemetry_instrumentation_threading-0.61b0-py3-none-any.whl", hash = "sha256:735f4a1dc964202fc8aff475efc12bb64e6566f22dff52d5cb5de864b3fe1a70", size = 9337, upload-time = 
"2026-03-04T14:19:57.983Z" }, + { url = "https://files.pythonhosted.org/packages/c7/a3/448738b927bcc1843ace7d4ed55dd54441a71363075eeeee89c5944dd740/opentelemetry_instrumentation_threading-0.60b1-py3-none-any.whl", hash = "sha256:92a52a60fee5e32bc6aa8f5acd749b15691ad0bc4457a310f5736b76a6d9d1de", size = 9312, upload-time = "2025-12-11T13:36:28.434Z" }, ] [[package]] name = "opentelemetry-sdk" -version = "1.40.0" +version = "1.39.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "opentelemetry-api" }, { name = "opentelemetry-semantic-conventions" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/58/fd/3c3125b20ba18ce2155ba9ea74acb0ae5d25f8cd39cfd37455601b7955cc/opentelemetry_sdk-1.40.0.tar.gz", hash = "sha256:18e9f5ec20d859d268c7cb3c5198c8d105d073714db3de50b593b8c1345a48f2", size = 184252, upload-time = "2026-03-04T14:17:31.87Z" } +sdist = { url = "https://files.pythonhosted.org/packages/eb/fb/c76080c9ba07e1e8235d24cdcc4d125ef7aa3edf23eb4e497c2e50889adc/opentelemetry_sdk-1.39.1.tar.gz", hash = "sha256:cf4d4563caf7bff906c9f7967e2be22d0d6b349b908be0d90fb21c8e9c995cc6", size = 171460, upload-time = "2025-12-11T13:32:49.369Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/2c/c5/6a852903d8bfac758c6dc6e9a68b015d3c33f2f1be5e9591e0f4b69c7e0a/opentelemetry_sdk-1.40.0-py3-none-any.whl", hash = "sha256:787d2154a71f4b3d81f20524a8ce061b7db667d24e46753f32a7bc48f1c1f3f1", size = 141951, upload-time = "2026-03-04T14:17:17.961Z" }, + { url = "https://files.pythonhosted.org/packages/7c/98/e91cf858f203d86f4eccdf763dcf01cf03f1dae80c3750f7e635bfa206b6/opentelemetry_sdk-1.39.1-py3-none-any.whl", hash = "sha256:4d5482c478513ecb0a5d938dcc61394e647066e0cc2676bee9f3af3f3f45f01c", size = 132565, upload-time = "2025-12-11T13:32:35.069Z" }, ] [[package]] name = "opentelemetry-semantic-conventions" -version = "0.61b0" +version = "0.60b1" source = { registry = "https://pypi.org/simple" } dependencies = [ { 
name = "opentelemetry-api" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/6d/c0/4ae7973f3c2cfd2b6e321f1675626f0dab0a97027cc7a297474c9c8f3d04/opentelemetry_semantic_conventions-0.61b0.tar.gz", hash = "sha256:072f65473c5d7c6dc0355b27d6c9d1a679d63b6d4b4b16a9773062cb7e31192a", size = 145755, upload-time = "2026-03-04T14:17:32.664Z" } +sdist = { url = "https://files.pythonhosted.org/packages/91/df/553f93ed38bf22f4b999d9be9c185adb558982214f33eae539d3b5cd0858/opentelemetry_semantic_conventions-0.60b1.tar.gz", hash = "sha256:87c228b5a0669b748c76d76df6c364c369c28f1c465e50f661e39737e84bc953", size = 137935, upload-time = "2025-12-11T13:32:50.487Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b2/37/cc6a55e448deaa9b27377d087da8615a3416d8ad523d5960b78dbeadd02a/opentelemetry_semantic_conventions-0.61b0-py3-none-any.whl", hash = "sha256:fa530a96be229795f8cef353739b618148b0fe2b4b3f005e60e262926c4d38e2", size = 231621, upload-time = "2026-03-04T14:17:19.33Z" }, + { url = "https://files.pythonhosted.org/packages/7a/5e/5958555e09635d09b75de3c4f8b9cae7335ca545d77392ffe7331534c402/opentelemetry_semantic_conventions-0.60b1-py3-none-any.whl", hash = "sha256:9fa8c8b0c110da289809292b0591220d3a7b53c1526a23021e977d68597893fb", size = 219982, upload-time = "2025-12-11T13:32:36.955Z" }, ] [[package]] @@ -4259,6 +4287,9 @@ mcp = [ mem0 = [ { name = "mem0ai" }, ] +mistral = [ + { name = "mistralai" }, +] mlx-whisper = [ { name = "mlx-whisper" }, ] @@ -4413,6 +4444,7 @@ requires-dist = [ { name = "markdown", specifier = ">=3.7,<4" }, { name = "mcp", extras = ["cli"], marker = "extra == 'mcp'", specifier = ">=1.11.0,<2" }, { name = "mem0ai", marker = "extra == 'mem0'", specifier = ">=1.0.8,<2" }, + { name = "mistralai", marker = "extra == 'mistral'", specifier = ">=2.0.0,<3" }, { name = "mlx-whisper", marker = "extra == 'mlx-whisper'", specifier = "~=0.4.2" }, { name = "nltk", specifier = ">=3.9.4,<4" }, { name = "numba", specifier 
= ">=0.61.2,<1" },