From 7bdac028375098fe06c2bbd51f43ffd390e86497 Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Thu, 20 Nov 2025 14:21:58 -0500 Subject: [PATCH] Fix sample_rate issue in ElevenLabsRealtimeSTTService, add timestamps and logging --- CHANGELOG.md | 37 +++++++++---- src/pipecat/services/elevenlabs/stt.py | 77 +++++++++++++++++++++----- 2 files changed, 89 insertions(+), 25 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index aaaafd512..a40711642 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added support for `include_timestamps` and `enable_logging` in + `ElevenLabsRealtimeSTTService`. When `include_timestamps` is enabled, + timestamp data is included in the `TranscriptionFrame`'s `result` + parameter. + - Added optional speaking rate control to `InworldTTSService`. - Introduced a new `AggregatedTextFrame` type to support passing text along with @@ -144,11 +149,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 for how a match should be handled. - `REMOVE`: The text along with its delimiters will be removed from the - streaming text. Sentence aggregation will continue on as if this text + streaming text. Sentence aggregation will continue on as if this text did not exist. - `KEEP`: The delimiters will be removed, but the content between them - will be kept. Sentence aggregation will continue on with the internal + will be kept. Sentence aggregation will continue on with the internal text included. - `AGGREGATE`: The delimiters will be removed and the content between will @@ -163,15 +168,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 handlers. 
- ⚠️ Breaking change: The `PatternMatch` type returned to handlers registered - via `on_pattern_match` has been updated to subclass from the new - `Aggregation` type, which means that `content` has been replaced with - `text` and `pattern_id` has been replaced with `type`: + via `on_pattern_match` has been updated to subclass from the new + `Aggregation` type, which means that `content` has been replaced with + `text` and `pattern_id` has been replaced with `type`: - ```python - async dev on_match_tag(match: PatternMatch): - pattern = match.type # instead of match.pattern_id - text = match.text # instead of match.content - ``` + ```python + async def on_match_tag(match: PatternMatch): + pattern = match.type # instead of match.pattern_id + text = match.text # instead of match.content + ``` - `TextFrame` now includes the field `append_to_context` to support setting whether or not the encompassing text should be added to the LLM context (by @@ -236,6 +241,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed an issue in `ElevenLabsRealtimeSTTService` where dynamic language updates were not working. +- Fixed an issue in `ElevenLabsRealtimeSTTService` where setting the sample + rate would result in transcripts failing. + - Fixed `InworldTTSService` audio config payload to use camelCase keys expected by the Inworld API. @@ -297,11 +305,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Updated language mappings for the Google and Gemini TTS services to match official documentation. +- In `MiniMaxHttpTTSService`: + -- Added support for speech-2.6-hd and speech-2.6-turbo models + -- Added languages: Afrikaans, Bulgarian, Catalan, Danish, Persian, Filipino, Hebrew, + Croatian, Hungarian, Malay, Norwegian, Nynorsk, Slovak, Slovenian, Swedish, and Tamil + -- Added new emotions: calm and fluent + ### Deprecated - The `api_key` parameter in `GeminiTTSService` is deprecated. 
Use `credentials` or `credentials_path` instead for Google Cloud authentication. +- `english_normalization` input parameter for `MiniMaxHttpTTSService` is deprecated, + use `text_normalization` instead. + ### Fixed - Fixed a `SimliVideoService` connection issue. diff --git a/src/pipecat/services/elevenlabs/stt.py b/src/pipecat/services/elevenlabs/stt.py index 03929882e..95a802edd 100644 --- a/src/pipecat/services/elevenlabs/stt.py +++ b/src/pipecat/services/elevenlabs/stt.py @@ -416,6 +416,8 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService): Only used when commit_strategy is VAD. None uses ElevenLabs default. min_silence_duration_ms: Minimum silence duration for VAD (50-2000ms). Only used when commit_strategy is VAD. None uses ElevenLabs default. + include_timestamps: Whether to include word-level timestamps in transcripts. + enable_logging: Whether to enable logging on ElevenLabs' side. """ language_code: Optional[str] = None @@ -424,6 +426,8 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService): vad_threshold: Optional[float] = None min_speech_duration_ms: Optional[int] = None min_silence_duration_ms: Optional[int] = None + include_timestamps: bool = False + enable_logging: bool = False def __init__( self, @@ -628,10 +632,16 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService): if self._params.language_code: params.append(f"language_code={self._params.language_code}") - params.append(f"encoding={self._audio_format}") - params.append(f"sample_rate={self.sample_rate}") + params.append(f"audio_format={self._audio_format}") params.append(f"commit_strategy={self._params.commit_strategy.value}") + # Add optional parameters + if self._params.include_timestamps: + params.append(f"include_timestamps={str(self._params.include_timestamps).lower()}") + + if self._params.enable_logging: + params.append(f"enable_logging={str(self._params.enable_logging).lower()}") + # Add VAD parameters if using VAD commit strategy and values are specified if 
self._params.commit_strategy == CommitStrategy.VAD: if self._params.vad_silence_threshold_secs is not None: @@ -720,15 +730,20 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService): elif message_type == "committed_transcript_with_timestamps": await self._on_committed_transcript_with_timestamps(data) - elif message_type == "input_error": - error_msg = data.get("error", "Unknown input error") - logger.error(f"ElevenLabs input error: {error_msg}") - await self.push_error(ErrorFrame(f"Input error: {error_msg}")) + elif message_type == "error": + error_msg = data.get("error", "Unknown error") + logger.error(f"ElevenLabs error: {error_msg}") + await self.push_error(ErrorFrame(f"Error: {error_msg}")) - elif message_type in ["auth_error", "quota_exceeded", "transcriber_error", "error"]: - error_msg = data.get("error", data.get("message", "Unknown error")) - logger.error(f"ElevenLabs error ({message_type}): {error_msg}") - await self.push_error(ErrorFrame(f"{message_type}: {error_msg}")) + elif message_type == "auth_error": + error_msg = data.get("error", "Authentication error") + logger.error(f"ElevenLabs auth error: {error_msg}") + await self.push_error(ErrorFrame(f"Auth error: {error_msg}")) + + elif message_type == "quota_exceeded_error": + error_msg = data.get("error", "Quota exceeded") + logger.error(f"ElevenLabs quota exceeded: {error_msg}") + await self.push_error(ErrorFrame(f"Quota exceeded: {error_msg}")) else: logger.debug(f"Unknown message type: {message_type}") @@ -773,6 +788,11 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService): Args: data: Committed transcript data. 
""" + # If timestamps are enabled, skip this message and wait for the + # committed_transcript_with_timestamps message which contains all the data + if self._params.include_timestamps: + return + text = data.get("text", "").strip() if not text: return @@ -800,6 +820,18 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService): async def _on_committed_transcript_with_timestamps(self, data: dict): """Handle committed transcript with word-level timestamps. + This message is sent when include_timestamps=true. The result data includes: + - text: The transcribed text + - language_code: Detected language (if available) + - words: Array of word objects with timing information: + - text: The word text + - start: Start time in seconds + - end: End time in seconds + - type: "word" or "spacing" + - speaker_id: Speaker identifier (if available) + - logprob: Log probability score (if available) + - characters: Array of character strings (if available) + Args: data: Committed transcript data with timestamps. """ @@ -807,9 +839,24 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService): if not text: return - logger.debug(f"Committed transcript with timestamps: [{text}]") - logger.trace(f"Timestamps: {data.get('words', [])}") + await self.stop_ttfb_metrics() + await self.stop_processing_metrics() - # This is sent after the committed_transcript, so we don't need to - # push another TranscriptionFrame, but we could use the timestamps - # for additional processing if needed in the future + # Get language if provided + language = data.get("language_code") + + logger.debug(f"Committed transcript with timestamps: [{text}]") + + await self._handle_transcription(text, True, language) + + # This message is sent after committed_transcript when include_timestamps=true. + # It contains the full transcript data including text and word-level timestamps. + await self.push_frame( + TranscriptionFrame( + text, + self._user_id, + time_now_iso8601(), + language, + result=data, + ) + )