From 7bdac028375098fe06c2bbd51f43ffd390e86497 Mon Sep 17 00:00:00 2001
From: Mark Backman <mark@daily.co>
Date: Thu, 20 Nov 2025 14:21:58 -0500
Subject: [PATCH] Fix sample_rate issue in ElevenLabsRealtimeSTTService, add
 timestamps and logging

---
 CHANGELOG.md                           | 37 +++++++++----
 src/pipecat/services/elevenlabs/stt.py | 77 +++++++++++++++++++++-----
 2 files changed, 89 insertions(+), 25 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index aaaafd512..a40711642 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- Added support for `include_timestamps` and `enable_logging` in
+  `ElevenLabsRealtimeSTTService`. When `include_timestamps` is enabled,
+  timestamp data is included in the `TranscriptionFrame`'s `result`
+  parameter.
+
 - Added optional speaking rate control to `InworldTTSService`.
 
 - Introduced a new `AggregatedTextFrame` type to support passing text along with
@@ -144,11 +149,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
       for how a match should be handled.
 
       - `REMOVE`: The text along with its delimiters will be removed from the
-        streaming text.  Sentence aggregation will continue on as if this text
+        streaming text. Sentence aggregation will continue on as if this text
         did not exist.
 
       - `KEEP`: The delimiters will be removed, but the content between them
-        will be kept.  Sentence aggregation will continue on with the internal
+        will be kept. Sentence aggregation will continue on with the internal
         text included.
 
       - `AGGREGATE`: The delimiters will be removed and the content between will
@@ -163,15 +168,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
       handlers.
 
   - ⚠️ Breaking change: The `PatternMatch` type returned to handlers registered
-     via `on_pattern_match` has been updated to subclass from the new
-     `Aggregation` type, which means that `content` has been replaced with
-     `text` and `pattern_id` has been replaced with `type`:
+    via `on_pattern_match` has been updated to subclass from the new
+    `Aggregation` type, which means that `content` has been replaced with
+    `text` and `pattern_id` has been replaced with `type`:
 
-     ```python
-     async dev on_match_tag(match: PatternMatch):
-        pattern = match.type # instead of match.pattern_id
-        text = match.text # instead of match.content
-     ```
+    ```python
+    async def on_match_tag(match: PatternMatch):
+       pattern = match.type # instead of match.pattern_id
+       text = match.text # instead of match.content
+    ```
 
 - `TextFrame` now includes the field `append_to_context` to support setting
   whether or not the encompassing text should be added to the LLM context (by
@@ -236,6 +241,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Fixed an issue in `ElevenLabsRealtimeSTTService` where dynamic language
   updates were not working.
 
+- Fixed an issue in `ElevenLabsRealtimeSTTService` where setting the sample
+  rate would result in transcripts failing.
+
 - Fixed `InworldTTSService` audio config payload to use camelCase keys expected
   by the Inworld API.
 
@@ -297,11 +305,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Updated language mappings for the Google and Gemini TTS services to match
   official documentation.
 
+- In `MiniMaxHttpTTSService`:
+  - Added support for the `speech-2.6-hd` and `speech-2.6-turbo` models.
+  - Added languages: Afrikaans, Bulgarian, Catalan, Danish, Persian, Filipino, Hebrew,
+    Croatian, Hungarian, Malay, Norwegian, Nynorsk, Slovak, Slovenian, Swedish, and Tamil.
+  - Added new emotions: calm and fluent.
+
 ### Deprecated
 
 - The `api_key` parameter in `GeminiTTSService` is deprecated. Use
   `credentials` or `credentials_path` instead for Google Cloud authentication.
 
+- `english_normalization` input parameter for `MiniMaxHttpTTSService` is deprecated,
+  use `text_normalization` instead.
+
 ### Fixed
 
 - Fixed a `SimliVideoService` connection issue.
diff --git a/src/pipecat/services/elevenlabs/stt.py b/src/pipecat/services/elevenlabs/stt.py
index 03929882e..95a802edd 100644
--- a/src/pipecat/services/elevenlabs/stt.py
+++ b/src/pipecat/services/elevenlabs/stt.py
@@ -416,6 +416,8 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
                 Only used when commit_strategy is VAD. None uses ElevenLabs default.
             min_silence_duration_ms: Minimum silence duration for VAD (50-2000ms).
                 Only used when commit_strategy is VAD. None uses ElevenLabs default.
+            include_timestamps: Whether to include word-level timestamps in transcripts.
+            enable_logging: Whether to enable logging on ElevenLabs' side.
         """
 
         language_code: Optional[str] = None
@@ -424,6 +426,8 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
         vad_threshold: Optional[float] = None
         min_speech_duration_ms: Optional[int] = None
         min_silence_duration_ms: Optional[int] = None
+        include_timestamps: bool = False
+        enable_logging: bool = False
 
     def __init__(
         self,
@@ -628,10 +632,16 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
             if self._params.language_code:
                 params.append(f"language_code={self._params.language_code}")
 
-            params.append(f"encoding={self._audio_format}")
-            params.append(f"sample_rate={self.sample_rate}")
+            params.append(f"audio_format={self._audio_format}")
             params.append(f"commit_strategy={self._params.commit_strategy.value}")
 
+            # Add optional parameters
+            if self._params.include_timestamps:
+                params.append(f"include_timestamps={str(self._params.include_timestamps).lower()}")
+
+            if self._params.enable_logging:
+                params.append(f"enable_logging={str(self._params.enable_logging).lower()}")
+
             # Add VAD parameters if using VAD commit strategy and values are specified
             if self._params.commit_strategy == CommitStrategy.VAD:
                 if self._params.vad_silence_threshold_secs is not None:
@@ -720,15 +730,20 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
         elif message_type == "committed_transcript_with_timestamps":
             await self._on_committed_transcript_with_timestamps(data)
 
-        elif message_type == "input_error":
-            error_msg = data.get("error", "Unknown input error")
-            logger.error(f"ElevenLabs input error: {error_msg}")
-            await self.push_error(ErrorFrame(f"Input error: {error_msg}"))
+        elif message_type == "error":
+            error_msg = data.get("error", "Unknown error")
+            logger.error(f"ElevenLabs error: {error_msg}")
+            await self.push_error(ErrorFrame(f"Error: {error_msg}"))
 
-        elif message_type in ["auth_error", "quota_exceeded", "transcriber_error", "error"]:
-            error_msg = data.get("error", data.get("message", "Unknown error"))
-            logger.error(f"ElevenLabs error ({message_type}): {error_msg}")
-            await self.push_error(ErrorFrame(f"{message_type}: {error_msg}"))
+        elif message_type == "auth_error":
+            error_msg = data.get("error", "Authentication error")
+            logger.error(f"ElevenLabs auth error: {error_msg}")
+            await self.push_error(ErrorFrame(f"Auth error: {error_msg}"))
+
+        elif message_type == "quota_exceeded_error":
+            error_msg = data.get("error", "Quota exceeded")
+            logger.error(f"ElevenLabs quota exceeded: {error_msg}")
+            await self.push_error(ErrorFrame(f"Quota exceeded: {error_msg}"))
 
         else:
             logger.debug(f"Unknown message type: {message_type}")
@@ -773,6 +788,11 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
         Args:
             data: Committed transcript data.
         """
+        # If timestamps are enabled, skip this message and wait for the
+        # committed_transcript_with_timestamps message which contains all the data
+        if self._params.include_timestamps:
+            return
+
         text = data.get("text", "").strip()
         if not text:
             return
@@ -800,6 +820,18 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
     async def _on_committed_transcript_with_timestamps(self, data: dict):
         """Handle committed transcript with word-level timestamps.
 
+        This message is sent when include_timestamps=true. The result data includes:
+        - text: The transcribed text
+        - language_code: Detected language (if available)
+        - words: Array of word objects with timing information:
+            - text: The word text
+            - start: Start time in seconds
+            - end: End time in seconds
+            - type: "word" or "spacing"
+            - speaker_id: Speaker identifier (if available)
+            - logprob: Log probability score (if available)
+            - characters: Array of character strings (if available)
+
         Args:
             data: Committed transcript data with timestamps.
         """
@@ -807,9 +839,24 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
         if not text:
             return
 
-        logger.debug(f"Committed transcript with timestamps: [{text}]")
-        logger.trace(f"Timestamps: {data.get('words', [])}")
+        await self.stop_ttfb_metrics()
+        await self.stop_processing_metrics()
 
-        # This is sent after the committed_transcript, so we don't need to
-        # push another TranscriptionFrame, but we could use the timestamps
-        # for additional processing if needed in the future
+        # Get language if provided
+        language = data.get("language_code")
+
+        logger.debug(f"Committed transcript with timestamps: [{text}]")
+
+        await self._handle_transcription(text, True, language)
+
+        # This message is sent after committed_transcript when include_timestamps=true.
+        # It contains the full transcript data including text and word-level timestamps.
+        await self.push_frame(
+            TranscriptionFrame(
+                text,
+                self._user_id,
+                time_now_iso8601(),
+                language,
+                result=data,
+            )
+        )