Fix sample_rate issue in ElevenLabsRealtimeSTTService, add timestamps and logging
This commit is contained in:
37
CHANGELOG.md
37
CHANGELOG.md
@@ -9,6 +9,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
### Added
|
||||
|
||||
- Added support for `include_timestamps` and `enable_logging` in
|
||||
`ElevenLabsRealtimeSTTService`. When `include_timestamps` is enabled,
|
||||
timestamp data is included in the `TranscriptionFrame`'s `result`
|
||||
parameter.
|
||||
|
||||
- Added optional speaking rate control to `InworldTTSService`.
|
||||
|
||||
- Introduced a new `AggregatedTextFrame` type to support passing text along with
|
||||
@@ -144,11 +149,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
for how a match should be handled.
|
||||
|
||||
- `REMOVE`: The text along with its delimiters will be removed from the
|
||||
streaming text. Sentence aggregation will continue on as if this text
|
||||
streaming text. Sentence aggregation will continue on as if this text
|
||||
did not exist.
|
||||
|
||||
- `KEEP`: The delimiters will be removed, but the content between them
|
||||
will be kept. Sentence aggregation will continue on with the internal
|
||||
will be kept. Sentence aggregation will continue on with the internal
|
||||
text included.
|
||||
|
||||
- `AGGREGATE`: The delimiters will be removed and the content between will
|
||||
@@ -163,15 +168,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
handlers.
|
||||
|
||||
- ⚠️ Breaking change: The `PatternMatch` type returned to handlers registered
|
||||
via `on_pattern_match` has been updated to subclass from the new
|
||||
`Aggregation` type, which means that `content` has been replaced with
|
||||
`text` and `pattern_id` has been replaced with `type`:
|
||||
via `on_pattern_match` has been updated to subclass from the new
|
||||
`Aggregation` type, which means that `content` has been replaced with
|
||||
`text` and `pattern_id` has been replaced with `type`:
|
||||
|
||||
```python
|
||||
async dev on_match_tag(match: PatternMatch):
|
||||
pattern = match.type # instead of match.pattern_id
|
||||
text = match.text # instead of match.content
|
||||
```
|
||||
```python
|
||||
async dev on_match_tag(match: PatternMatch):
|
||||
pattern = match.type # instead of match.pattern_id
|
||||
text = match.text # instead of match.content
|
||||
```
|
||||
|
||||
- `TextFrame` now includes the field `append_to_context` to support setting
|
||||
whether or not the encompassing text should be added to the LLM context (by
|
||||
@@ -236,6 +241,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
- Fixed an issue in `ElevenLabsRealtimeSTTService` where dynamic language
|
||||
updates were not working.
|
||||
|
||||
- Fixed an issue in `ElevenLabsRealtimeSTTService` where setting the sample
|
||||
rate would result in transcripts failing.
|
||||
|
||||
- Fixed `InworldTTSService` audio config payload to use camelCase keys expected
|
||||
by the Inworld API.
|
||||
|
||||
@@ -297,11 +305,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
- Updated language mappings for the Google and Gemini TTS services to match
|
||||
official documentation.
|
||||
|
||||
- In `MiniMaxHttpTTSService`:
|
||||
-- Added support for speech-2.6-hd and speech-2.6-turbo models
|
||||
-- Added languages: Afrikaans, Bulgarian, Catalan, Danish, Persian, Filipino, Hebrew,
|
||||
Croatian, Hungarian, Malay, Norwegian, Nynorsk, Slovak, Slovenian, Swedish, and Tamil
|
||||
-- Added new emotions: calm and fluent
|
||||
|
||||
### Deprecated
|
||||
|
||||
- The `api_key` parameter in `GeminiTTSService` is deprecated. Use
|
||||
`credentials` or `credentials_path` instead for Google Cloud authentication.
|
||||
|
||||
- `english_normalization` input parameter for `MiniMaxHttpTTSService` is deprecated,
|
||||
use `test_normalization` instead.
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fixed a `SimliVideoService` connection issue.
|
||||
|
||||
@@ -416,6 +416,8 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
|
||||
Only used when commit_strategy is VAD. None uses ElevenLabs default.
|
||||
min_silence_duration_ms: Minimum silence duration for VAD (50-2000ms).
|
||||
Only used when commit_strategy is VAD. None uses ElevenLabs default.
|
||||
include_timestamps: Whether to include word-level timestamps in transcripts.
|
||||
enable_logging: Whether to enable logging on ElevenLabs' side.
|
||||
"""
|
||||
|
||||
language_code: Optional[str] = None
|
||||
@@ -424,6 +426,8 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
|
||||
vad_threshold: Optional[float] = None
|
||||
min_speech_duration_ms: Optional[int] = None
|
||||
min_silence_duration_ms: Optional[int] = None
|
||||
include_timestamps: bool = False
|
||||
enable_logging: bool = False
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -628,10 +632,16 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
|
||||
if self._params.language_code:
|
||||
params.append(f"language_code={self._params.language_code}")
|
||||
|
||||
params.append(f"encoding={self._audio_format}")
|
||||
params.append(f"sample_rate={self.sample_rate}")
|
||||
params.append(f"audio_format={self._audio_format}")
|
||||
params.append(f"commit_strategy={self._params.commit_strategy.value}")
|
||||
|
||||
# Add optional parameters
|
||||
if self._params.include_timestamps:
|
||||
params.append(f"include_timestamps={str(self._params.include_timestamps).lower()}")
|
||||
|
||||
if self._params.enable_logging:
|
||||
params.append(f"enable_logging={str(self._params.enable_logging).lower()}")
|
||||
|
||||
# Add VAD parameters if using VAD commit strategy and values are specified
|
||||
if self._params.commit_strategy == CommitStrategy.VAD:
|
||||
if self._params.vad_silence_threshold_secs is not None:
|
||||
@@ -720,15 +730,20 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
|
||||
elif message_type == "committed_transcript_with_timestamps":
|
||||
await self._on_committed_transcript_with_timestamps(data)
|
||||
|
||||
elif message_type == "input_error":
|
||||
error_msg = data.get("error", "Unknown input error")
|
||||
logger.error(f"ElevenLabs input error: {error_msg}")
|
||||
await self.push_error(ErrorFrame(f"Input error: {error_msg}"))
|
||||
elif message_type == "error":
|
||||
error_msg = data.get("error", "Unknown error")
|
||||
logger.error(f"ElevenLabs error: {error_msg}")
|
||||
await self.push_error(ErrorFrame(f"Error: {error_msg}"))
|
||||
|
||||
elif message_type in ["auth_error", "quota_exceeded", "transcriber_error", "error"]:
|
||||
error_msg = data.get("error", data.get("message", "Unknown error"))
|
||||
logger.error(f"ElevenLabs error ({message_type}): {error_msg}")
|
||||
await self.push_error(ErrorFrame(f"{message_type}: {error_msg}"))
|
||||
elif message_type == "auth_error":
|
||||
error_msg = data.get("error", "Authentication error")
|
||||
logger.error(f"ElevenLabs auth error: {error_msg}")
|
||||
await self.push_error(ErrorFrame(f"Auth error: {error_msg}"))
|
||||
|
||||
elif message_type == "quota_exceeded_error":
|
||||
error_msg = data.get("error", "Quota exceeded")
|
||||
logger.error(f"ElevenLabs quota exceeded: {error_msg}")
|
||||
await self.push_error(ErrorFrame(f"Quota exceeded: {error_msg}"))
|
||||
|
||||
else:
|
||||
logger.debug(f"Unknown message type: {message_type}")
|
||||
@@ -773,6 +788,11 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
|
||||
Args:
|
||||
data: Committed transcript data.
|
||||
"""
|
||||
# If timestamps are enabled, skip this message and wait for the
|
||||
# committed_transcript_with_timestamps message which contains all the data
|
||||
if self._params.include_timestamps:
|
||||
return
|
||||
|
||||
text = data.get("text", "").strip()
|
||||
if not text:
|
||||
return
|
||||
@@ -800,6 +820,18 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
|
||||
async def _on_committed_transcript_with_timestamps(self, data: dict):
|
||||
"""Handle committed transcript with word-level timestamps.
|
||||
|
||||
This message is sent when include_timestamps=true. The result data includes:
|
||||
- text: The transcribed text
|
||||
- language_code: Detected language (if available)
|
||||
- words: Array of word objects with timing information:
|
||||
- text: The word text
|
||||
- start: Start time in seconds
|
||||
- end: End time in seconds
|
||||
- type: "word" or "spacing"
|
||||
- speaker_id: Speaker identifier (if available)
|
||||
- logprob: Log probability score (if available)
|
||||
- characters: Array of character strings (if available)
|
||||
|
||||
Args:
|
||||
data: Committed transcript data with timestamps.
|
||||
"""
|
||||
@@ -807,9 +839,24 @@ class ElevenLabsRealtimeSTTService(WebsocketSTTService):
|
||||
if not text:
|
||||
return
|
||||
|
||||
logger.debug(f"Committed transcript with timestamps: [{text}]")
|
||||
logger.trace(f"Timestamps: {data.get('words', [])}")
|
||||
await self.stop_ttfb_metrics()
|
||||
await self.stop_processing_metrics()
|
||||
|
||||
# This is sent after the committed_transcript, so we don't need to
|
||||
# push another TranscriptionFrame, but we could use the timestamps
|
||||
# for additional processing if needed in the future
|
||||
# Get language if provided
|
||||
language = data.get("language_code")
|
||||
|
||||
logger.debug(f"Committed transcript with timestamps: [{text}]")
|
||||
|
||||
await self._handle_transcription(text, True, language)
|
||||
|
||||
# This message is sent after committed_transcript when include_timestamps=true.
|
||||
# It contains the full transcript data including text and word-level timestamps.
|
||||
await self.push_frame(
|
||||
TranscriptionFrame(
|
||||
text,
|
||||
self._user_id,
|
||||
time_now_iso8601(),
|
||||
language,
|
||||
result=data,
|
||||
)
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user