Fix sample_rate issue in ElevenLabsRealtimeSTTService, add timestamps and logging

2025-11-20 14:21:58 -05:00
parent 861567bc59
commit 7bdac02837
2 changed files with 89 additions and 25 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### Added

+- Added support for `include_timestamps` and `enable_logging` in
+  `ElevenLabsRealtimeSTTService`. When `include_timestamps` is enabled,
+  timestamp data is included in the `TranscriptionFrame`'s `result`
+  parameter.
+
 - Added optional speaking rate control to `InworldTTSService`.

 - Introduced a new `AggregatedTextFrame` type to support passing text along with
@@ -144,11 +149,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
      for how a match should be handled.

      - `REMOVE`: The text along with its delimiters will be removed from the
-        streaming text.  Sentence aggregation will continue on as if this text
+        streaming text. Sentence aggregation will continue on as if this text
        did not exist.

      - `KEEP`: The delimiters will be removed, but the content between them
-        will be kept.  Sentence aggregation will continue on with the internal
+        will be kept. Sentence aggregation will continue on with the internal
        text included.

      - `AGGREGATE`: The delimiters will be removed and the content between will
@@ -163,15 +168,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
      handlers.

  - ⚠️ Breaking change: The `PatternMatch` type returned to handlers registered
-     via `on_pattern_match` has been updated to subclass from the new
-     `Aggregation` type, which means that `content` has been replaced with
-     `text` and `pattern_id` has been replaced with `type`:
+    via `on_pattern_match` has been updated to subclass from the new
+    `Aggregation` type, which means that `content` has been replaced with
+    `text` and `pattern_id` has been replaced with `type`:

-     ```python
-     async dev on_match_tag(match: PatternMatch):
-        pattern = match.type # instead of match.pattern_id
-        text = match.text # instead of match.content
-     ```
+    ```python
+    async def on_match_tag(match: PatternMatch):
+        pattern = match.type  # instead of match.pattern_id
+        text = match.text  # instead of match.content
+    ```

 - `TextFrame` now includes the field `append_to_context` to support setting
  whether or not the encompassing text should be added to the LLM context (by
@@ -236,6 +241,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Fixed an issue in `ElevenLabsRealtimeSTTService` where dynamic language
  updates were not working.

+- Fixed an issue in `ElevenLabsRealtimeSTTService` where setting the sample
+  rate would result in transcripts failing.
+
 - Fixed `InworldTTSService` audio config payload to use camelCase keys expected
  by the Inworld API.

@@ -297,11 +305,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Updated language mappings for the Google and Gemini TTS services to match
  official documentation.

+- In `MiniMaxHttpTTSService`:
+  - Added support for `speech-2.6-hd` and `speech-2.6-turbo` models.
+  - Added languages: Afrikaans, Bulgarian, Catalan, Danish, Persian, Filipino, Hebrew,
+    Croatian, Hungarian, Malay, Norwegian, Nynorsk, Slovak, Slovenian, Swedish, and Tamil.
+  - Added new emotions: calm and fluent.
+
 ### Deprecated

 - The `api_key` parameter in `GeminiTTSService` is deprecated. Use
  `credentials` or `credentials_path` instead for Google Cloud authentication.

+- `english_normalization` input parameter for `MiniMaxHttpTTSService` is deprecated,
+  use `text_normalization` instead.
+
 ### Fixed

 - Fixed a `SimliVideoService` connection issue.
--- a/src/pipecat/services/elevenlabs/stt.py
+++ b/src/pipecat/services/elevenlabs/stt.py
@@ -416,6 +416,8 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
                Only used when commit_strategy is VAD. None uses ElevenLabs default.
            min_silence_duration_ms: Minimum silence duration for VAD (50-2000ms).
                Only used when commit_strategy is VAD. None uses ElevenLabs default.
+            include_timestamps: Whether to include word-level timestamps in transcripts.
+            enable_logging: Whether to enable logging on ElevenLabs' side.
        """

        language_code: Optional[str] = None
@@ -424,6 +426,8 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
        vad_threshold: Optional[float] = None
        min_speech_duration_ms: Optional[int] = None
        min_silence_duration_ms: Optional[int] = None
+        include_timestamps: bool = False
+        enable_logging: bool = False

    def __init__(
        self,
@@ -628,10 +632,16 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
            if self._params.language_code:
                params.append(f"language_code={self._params.language_code}")

-            params.append(f"encoding={self._audio_format}")
-            params.append(f"sample_rate={self.sample_rate}")
+            params.append(f"audio_format={self._audio_format}")
            params.append(f"commit_strategy={self._params.commit_strategy.value}")

+            # Add optional parameters
+            if self._params.include_timestamps:
+                params.append(f"include_timestamps={str(self._params.include_timestamps).lower()}")
+
+            if self._params.enable_logging:
+                params.append(f"enable_logging={str(self._params.enable_logging).lower()}")
+
            # Add VAD parameters if using VAD commit strategy and values are specified
            if self._params.commit_strategy == CommitStrategy.VAD:
                if self._params.vad_silence_threshold_secs is not None:
@@ -720,15 +730,20 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
        elif message_type == "committed_transcript_with_timestamps":
            await self._on_committed_transcript_with_timestamps(data)

-        elif message_type == "input_error":
-            error_msg = data.get("error", "Unknown input error")
-            logger.error(f"ElevenLabs input error: {error_msg}")
-            await self.push_error(ErrorFrame(f"Input error: {error_msg}"))
+        elif message_type == "error":
+            error_msg = data.get("error", "Unknown error")
+            logger.error(f"ElevenLabs error: {error_msg}")
+            await self.push_error(ErrorFrame(f"Error: {error_msg}"))

-        elif message_type in ["auth_error", "quota_exceeded", "transcriber_error", "error"]:
-            error_msg = data.get("error", data.get("message", "Unknown error"))
-            logger.error(f"ElevenLabs error ({message_type}): {error_msg}")
-            await self.push_error(ErrorFrame(f"{message_type}: {error_msg}"))
+        elif message_type == "auth_error":
+            error_msg = data.get("error", "Authentication error")
+            logger.error(f"ElevenLabs auth error: {error_msg}")
+            await self.push_error(ErrorFrame(f"Auth error: {error_msg}"))
+
+        elif message_type == "quota_exceeded_error":
+            error_msg = data.get("error", "Quota exceeded")
+            logger.error(f"ElevenLabs quota exceeded: {error_msg}")
+            await self.push_error(ErrorFrame(f"Quota exceeded: {error_msg}"))

        else:
            logger.debug(f"Unknown message type: {message_type}")
@@ -773,6 +788,11 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
        Args:
            data: Committed transcript data.
        """
+        # If timestamps are enabled, skip this message and wait for the
+        # committed_transcript_with_timestamps message which contains all the data
+        if self._params.include_timestamps:
+            return
+
        text = data.get("text", "").strip()
        if not text:
            return
@@ -800,6 +820,18 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
    async def _on_committed_transcript_with_timestamps(self, data: dict):
        """Handle committed transcript with word-level timestamps.

+        This message is sent when include_timestamps=true. The result data includes:
+        - text: The transcribed text
+        - language_code: Detected language (if available)
+        - words: Array of word objects with timing information:
+            - text: The word text
+            - start: Start time in seconds
+            - end: End time in seconds
+            - type: "word" or "spacing"
+            - speaker_id: Speaker identifier (if available)
+            - logprob: Log probability score (if available)
+            - characters: Array of character strings (if available)
+
        Args:
            data: Committed transcript data with timestamps.
        """
@@ -807,9 +839,24 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
        if not text:
            return

-        logger.debug(f"Committed transcript with timestamps: [{text}]")
-        logger.trace(f"Timestamps: {data.get('words', [])}")
+        await self.stop_ttfb_metrics()
+        await self.stop_processing_metrics()

-        # This is sent after the committed_transcript, so we don't need to
-        # push another TranscriptionFrame, but we could use the timestamps
-        # for additional processing if needed in the future
+        # Get language if provided
+        language = data.get("language_code")
+
+        logger.debug(f"Committed transcript with timestamps: [{text}]")
+
+        await self._handle_transcription(text, True, language)
+
+        # This message is sent after committed_transcript when include_timestamps=true.
+        # It contains the full transcript data including text and word-level timestamps.
+        await self.push_frame(
+            TranscriptionFrame(
+                text,
+                self._user_id,
+                time_now_iso8601(),
+                language,
+                result=data,
+            )
+        )