Fix sample_rate issue in ElevenLabsRealtimeSTTService, add timestamps and logging

This commit is contained in:
Mark Backman
2025-11-20 14:21:58 -05:00
parent 861567bc59
commit 7bdac02837
2 changed files with 89 additions and 25 deletions

View File

@@ -9,6 +9,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- Added support for `include_timestamps` and `enable_logging` in
`ElevenLabsRealtimeSTTService`. When `include_timestamps` is enabled,
timestamp data is included in the `TranscriptionFrame`'s `result`
parameter.
- Added optional speaking rate control to `InworldTTSService`.
- Introduced a new `AggregatedTextFrame` type to support passing text along with
@@ -144,11 +149,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
for how a match should be handled.
- `REMOVE`: The text along with its delimiters will be removed from the
streaming text. Sentence aggregation will continue on as if this text
streaming text. Sentence aggregation will continue on as if this text
did not exist.
- `KEEP`: The delimiters will be removed, but the content between them
will be kept. Sentence aggregation will continue on with the internal
will be kept. Sentence aggregation will continue on with the internal
text included.
- `AGGREGATE`: The delimiters will be removed and the content between will
@@ -163,15 +168,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
handlers.
- ⚠️ Breaking change: The `PatternMatch` type returned to handlers registered
via `on_pattern_match` has been updated to subclass from the new
`Aggregation` type, which means that `content` has been replaced with
`text` and `pattern_id` has been replaced with `type`:
via `on_pattern_match` has been updated to subclass from the new
`Aggregation` type, which means that `content` has been replaced with
`text` and `pattern_id` has been replaced with `type`:
```python
async dev on_match_tag(match: PatternMatch):
pattern = match.type # instead of match.pattern_id
text = match.text # instead of match.content
```
```python
async dev on_match_tag(match: PatternMatch):
pattern = match.type # instead of match.pattern_id
text = match.text # instead of match.content
```
- `TextFrame` now includes the field `append_to_context` to support setting
whether or not the encompassing text should be added to the LLM context (by
@@ -236,6 +241,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fixed an issue in `ElevenLabsRealtimeSTTService` where dynamic language
updates were not working.
- Fixed an issue in `ElevenLabsRealtimeSTTService` where setting the sample
rate would result in transcripts failing.
- Fixed `InworldTTSService` audio config payload to use camelCase keys expected
by the Inworld API.
@@ -297,11 +305,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Updated language mappings for the Google and Gemini TTS services to match
official documentation.
- In `MiniMaxHttpTTSService`:
-- Added support for speech-2.6-hd and speech-2.6-turbo models
-- Added languages: Afrikaans, Bulgarian, Catalan, Danish, Persian, Filipino, Hebrew,
Croatian, Hungarian, Malay, Norwegian, Nynorsk, Slovak, Slovenian, Swedish, and Tamil
-- Added new emotions: calm and fluent
### Deprecated
- The `api_key` parameter in `GeminiTTSService` is deprecated. Use
`credentials` or `credentials_path` instead for Google Cloud authentication.
- `english_normalization` input parameter for `MiniMaxHttpTTSService` is deprecated,
use `test_normalization` instead.
### Fixed
- Fixed a `SimliVideoService` connection issue.

View File

@@ -416,6 +416,8 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
Only used when commit_strategy is VAD. None uses ElevenLabs default.
min_silence_duration_ms: Minimum silence duration for VAD (50-2000ms).
Only used when commit_strategy is VAD. None uses ElevenLabs default.
include_timestamps: Whether to include word-level timestamps in transcripts.
enable_logging: Whether to enable logging on ElevenLabs' side.
"""
language_code: Optional[str] = None
@@ -424,6 +426,8 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
vad_threshold: Optional[float] = None
min_speech_duration_ms: Optional[int] = None
min_silence_duration_ms: Optional[int] = None
include_timestamps: bool = False
enable_logging: bool = False
def __init__(
self,
@@ -628,10 +632,16 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
if self._params.language_code:
params.append(f"language_code={self._params.language_code}")
params.append(f"encoding={self._audio_format}")
params.append(f"sample_rate={self.sample_rate}")
params.append(f"audio_format={self._audio_format}")
params.append(f"commit_strategy={self._params.commit_strategy.value}")
# Add optional parameters
if self._params.include_timestamps:
params.append(f"include_timestamps={str(self._params.include_timestamps).lower()}")
if self._params.enable_logging:
params.append(f"enable_logging={str(self._params.enable_logging).lower()}")
# Add VAD parameters if using VAD commit strategy and values are specified
if self._params.commit_strategy == CommitStrategy.VAD:
if self._params.vad_silence_threshold_secs is not None:
@@ -720,15 +730,20 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
elif message_type == "committed_transcript_with_timestamps":
await self._on_committed_transcript_with_timestamps(data)
elif message_type == "input_error":
error_msg = data.get("error", "Unknown input error")
logger.error(f"ElevenLabs input error: {error_msg}")
await self.push_error(ErrorFrame(f"Input error: {error_msg}"))
elif message_type == "error":
error_msg = data.get("error", "Unknown error")
logger.error(f"ElevenLabs error: {error_msg}")
await self.push_error(ErrorFrame(f"Error: {error_msg}"))
elif message_type in ["auth_error", "quota_exceeded", "transcriber_error", "error"]:
error_msg = data.get("error", data.get("message", "Unknown error"))
logger.error(f"ElevenLabs error ({message_type}): {error_msg}")
await self.push_error(ErrorFrame(f"{message_type}: {error_msg}"))
elif message_type == "auth_error":
error_msg = data.get("error", "Authentication error")
logger.error(f"ElevenLabs auth error: {error_msg}")
await self.push_error(ErrorFrame(f"Auth error: {error_msg}"))
elif message_type == "quota_exceeded_error":
error_msg = data.get("error", "Quota exceeded")
logger.error(f"ElevenLabs quota exceeded: {error_msg}")
await self.push_error(ErrorFrame(f"Quota exceeded: {error_msg}"))
else:
logger.debug(f"Unknown message type: {message_type}")
@@ -773,6 +788,11 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
Args:
data: Committed transcript data.
"""
# If timestamps are enabled, skip this message and wait for the
# committed_transcript_with_timestamps message which contains all the data
if self._params.include_timestamps:
return
text = data.get("text", "").strip()
if not text:
return
@@ -800,6 +820,18 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
async def _on_committed_transcript_with_timestamps(self, data: dict):
"""Handle committed transcript with word-level timestamps.
This message is sent when include_timestamps=true. The result data includes:
- text: The transcribed text
- language_code: Detected language (if available)
- words: Array of word objects with timing information:
- text: The word text
- start: Start time in seconds
- end: End time in seconds
- type: "word" or "spacing"
- speaker_id: Speaker identifier (if available)
- logprob: Log probability score (if available)
- characters: Array of character strings (if available)
Args:
data: Committed transcript data with timestamps.
"""
@@ -807,9 +839,24 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
if not text:
return
logger.debug(f"Committed transcript with timestamps: [{text}]")
logger.trace(f"Timestamps: {data.get('words', [])}")
await self.stop_ttfb_metrics()
await self.stop_processing_metrics()
# This is sent after the committed_transcript, so we don't need to
# push another TranscriptionFrame, but we could use the timestamps
# for additional processing if needed in the future
# Get language if provided
language = data.get("language_code")
logger.debug(f"Committed transcript with timestamps: [{text}]")
await self._handle_transcription(text, True, language)
# This message is sent after committed_transcript when include_timestamps=true.
# It contains the full transcript data including text and word-level timestamps.
await self.push_frame(
TranscriptionFrame(
text,
self._user_id,
time_now_iso8601(),
language,
result=data,
)
)